Index: sys/contrib/rdma/krping/krping.h =================================================================== --- sys/contrib/rdma/krping/krping.h +++ sys/contrib/rdma/krping/krping.h @@ -14,7 +14,6 @@ char name[16]; }; -int krping_doit(char *, void *); +int krping_doit(char *); void krping_walk_cb_list(void (*)(struct krping_stats *, void *), void *); -void krping_init(void); int krping_sigpending(void); Index: sys/contrib/rdma/krping/krping.c =================================================================== --- sys/contrib/rdma/krping/krping.c +++ sys/contrib/rdma/krping/krping.c @@ -53,13 +53,14 @@ #include "krping.h" #include "getopt.h" +#define PFX "krping: " + extern int krping_debug; -#define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x) -#define PRINTF(cb, x...) log(LOG_INFO, x) +#define DEBUG_LOG(...) do { if (krping_debug) log(LOG_INFO, __VA_ARGS__); } while (0) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); -MODULE_DESCRIPTION("RDMA ping client/server"); +MODULE_DESCRIPTION("RDMA ping server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); @@ -76,21 +77,19 @@ enum mem_type { DMA = 1, - FASTREG = 2, - MW = 3, - MR = 4 + REG = 2, }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, + {"addr6", OPT_STRING, 'A'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, - {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, @@ -100,14 +99,14 @@ {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, - {"fr", OPT_INT, 'f'}, + {"fr", OPT_NOPARAM, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) -static struct mutex krping_mutex; +static DEFINE_MUTEX(krping_mutex); /* * List of running krping threads. @@ -115,6 +114,13 @@ static LIST_HEAD(krping_cbs); /* + * Invoke like this, one on each side, using the server's address on + * the RDMA device (iw%d): + * + * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping + * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping + * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping + * * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len @@ -163,42 +169,35 @@ * Control block struct. 
*/ struct krping_cb { - void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; - enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; - struct ib_send_wr fastreg_wr; + struct ib_reg_wr reg_mr_wr; struct ib_send_wr invalidate_wr; - struct ib_mr *fastreg_mr; + struct ib_mr *reg_mr; int server_invalidate; int read_inv; u8 key; - struct ib_mw *mw; - struct ib_mw_bind bind_attr; - struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ - struct krping_rdma_info recv_buf;/* malloc'd buffer */ + struct krping_rdma_info recv_buf __aligned(16); /* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) - struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work requrest record */ struct ib_sge send_sgl; - struct krping_rdma_info send_buf;/* single send buf */ + struct krping_rdma_info send_buf __aligned(16); /* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) - struct ib_mr *send_mr; - struct ib_send_wr rdma_sq_wr; /* rdma work request record */ + struct ib_rdma_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; @@ -219,8 +218,9 @@ struct krping_stats stats; uint16_t port; /* dst port in NBO */ - struct in_addr addr; /* dst addr in NBO */ + u8 addr[16]; /* dst addr in NBO */ char *addr_str; /* dst addr string */ + uint8_t addr_type; /* ADDR_FAMILY - IPv4/V6 */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ @@ -232,8 +232,7 @@ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ - int frtest; /* fastreg test */ - int testnum; + int frtest; /* reg test */ /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ @@ -248,39 +247,34 @@ int ret; struct krping_cb *cb = cma_id->context; - DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, - cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); + DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id, + (cma_id == cb->cm_id) ? 
"parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { - PRINTF(cb, "rdma_resolve_route error %d\n", ret); + printk(KERN_ERR PFX "rdma_resolve_route error %d\n", + ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; - cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: - if (cb->state == IDLE) { - cb->state = CONNECT_REQUEST; - cb->child_cm_id = cma_id; - } else { - PRINTF(cb, "Received connection request in wrong state" - " (%d)\n", cb->state); - } - DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); + cb->state = CONNECT_REQUEST; + cb->child_cm_id = cma_id; + DEBUG_LOG("child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: - DEBUG_LOG(cb, "ESTABLISHED\n"); + DEBUG_LOG("ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } @@ -292,24 +286,24 @@ case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - PRINTF(cb, "cma event %d, error %d\n", event->event, + printk(KERN_ERR PFX "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: - PRINTF(cb, "DISCONNECT EVENT...\n"); + printk(KERN_ERR PFX "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - PRINTF(cb, "cma detected device removal!!!!\n"); + printk(KERN_ERR PFX "cma detected device removal!!!!\n"); break; default: - PRINTF(cb, "oof bad type!\n"); + printk(KERN_ERR PFX "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } @@ -319,7 +313,7 @@ static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - PRINTF(cb, "Received bogus data, size %d\n", + printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } @@ -327,7 +321,7 @@ cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); - DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", + DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); @@ -342,7 +336,7 @@ static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - PRINTF(cb, "Received bogus data, size %d\n", + printk(KERN_ERR PFX "Received bogus data, size %d\n", wc->byte_len); return -1; } @@ -364,18 +358,22 @@ BUG_ON(cb->cq != cq); if (cb->state == ERROR) { - PRINTF(cb, "cq completion in ERROR state\n"); + printk(KERN_ERR PFX "cq completion in ERROR state\n"); + return; + } + if (cb->frtest) { + printk(KERN_ERR PFX "cq completion event in frtest!\n"); return; } - if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) + if (!cb->wlat && !cb->rlat && !cb->bw) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { - DEBUG_LOG(cb, "cq flushed\n"); + DEBUG_LOG("cq flushed\n"); continue; } else { - PRINTF(cb, "cq completion failed with " + printk(KERN_ERR PFX "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; @@ -384,44 +382,44 @@ switch (wc.opcode) { case IB_WC_SEND: - DEBUG_LOG(cb, "send 
completion\n"); + DEBUG_LOG("send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: - DEBUG_LOG(cb, "rdma write completion\n"); - cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; + DEBUG_LOG("rdma write completion\n"); + cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: - DEBUG_LOG(cb, "rdma read completion\n"); - cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; + DEBUG_LOG("rdma read completion\n"); + cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: - DEBUG_LOG(cb, "recv completion\n"); + DEBUG_LOG("recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; - if (cb->wlat || cb->rlat || cb->bw || cb->frtest) + if (cb->wlat || cb->rlat || cb->bw) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { - PRINTF(cb, "recv wc error: %d\n", ret); + printk(KERN_ERR PFX "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post recv error: %d\n", + printk(KERN_ERR PFX "post recv error: %d\n", ret); goto error; } @@ -429,14 +427,14 @@ break; default: - PRINTF(cb, + printk(KERN_ERR PFX "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); goto error; } return; @@ -450,7 +448,7 @@ struct rdma_conn_param conn_param; int ret; - DEBUG_LOG(cb, "accepting client connection request\n"); + DEBUG_LOG("accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; @@ -458,14 +456,14 @@ ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { - PRINTF(cb, "rdma_accept error: %d\n", ret); + printk(KERN_ERR PFX "rdma_accept error: %d\n", ret); return ret; } - if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { + if (!cb->wlat && !cb->rlat && !cb->bw) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { - PRINTF(cb, "wait for CONNECTED state %d\n", + printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } @@ -477,278 +475,120 @@ { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; - if (cb->local_dma_lkey) - cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; - else if (cb->mem == DMA) - cb->recv_sgl.lkey = cb->dma_mr->lkey; - else - cb->recv_sgl.lkey = cb->recv_mr->lkey; + cb->recv_sgl.lkey = cb->pd->local_dma_lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; - if (cb->local_dma_lkey) - cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; - else if (cb->mem == DMA) - cb->send_sgl.lkey = cb->dma_mr->lkey; - else - cb->send_sgl.lkey = cb->send_mr->lkey; + cb->send_sgl.lkey = cb->pd->local_dma_lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; - if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (cb->server || cb->wlat || cb->rlat || cb->bw) { cb->rdma_sgl.addr = cb->rdma_dma_addr; - if (cb->mem == MR) - cb->rdma_sgl.lkey = cb->rdma_mr->lkey; - 
cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; - cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; - cb->rdma_sq_wr.num_sge = 1; - } - - switch(cb->mem) { - case FASTREG: - - /* - * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. - * both unsignaled. The client uses them to reregister - * the rdma buffers with a new key each iteration. - */ - cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; - cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - cb->fastreg_wr.wr.fast_reg.length = cb->size; - cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; - cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; - - cb->invalidate_wr.next = &cb->fastreg_wr; - cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; - break; - case MW: - cb->bind_attr.wr_id = 0xabbaabba; - cb->bind_attr.send_flags = 0; /* unsignaled */ -#ifdef BIND_INFO - cb->bind_attr.bind_info.length = cb->size; -#else - cb->bind_attr.length = cb->size; -#endif - break; - default: - break; + cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED; + cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl; + cb->rdma_sq_wr.wr.num_sge = 1; } + + /* + * A chain of 2 WRs, INVALDATE_MR + REG_MR. + * both unsignaled. The client uses them to reregister + * the rdma buffers with a new key each iteration. + */ + cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR; + cb->reg_mr_wr.mr = cb->reg_mr; + + cb->invalidate_wr.next = &cb->reg_mr_wr.wr; + cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; } static int krping_setup_buffers(struct krping_cb *cb) { int ret; - struct ib_phys_buf buf; - u64 iovbase; - DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); + DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); - cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, + cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); - cb->send_dma_addr = ib_dma_map_single(cb->pd->device, + cb->send_dma_addr = ib_dma_map_single(cb->pd->device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); - if (cb->mem == DMA) { - cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| - IB_ACCESS_REMOTE_READ| - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(cb->dma_mr)) { - DEBUG_LOG(cb, "reg_dmamr failed\n"); - ret = PTR_ERR(cb->dma_mr); - goto bail; - } - } else { - if (!cb->local_dma_lkey) { - buf.addr = cb->recv_dma_addr; - buf.size = sizeof cb->recv_buf; - DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->recv_dma_addr; - cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - IB_ACCESS_LOCAL_WRITE, - &iovbase); - - if (IS_ERR(cb->recv_mr)) { - DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->recv_mr); - goto bail; - } - - buf.addr = cb->send_dma_addr; - buf.size = sizeof cb->send_buf; - DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->send_dma_addr; - cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - 0, &iovbase); - - if (IS_ERR(cb->send_mr)) { - DEBUG_LOG(cb, "send_buf reg_mr failed\n"); - ret = PTR_ERR(cb->send_mr); - goto bail; - } - } - } - - cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); + cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, + &cb->rdma_dma_addr, + GFP_KERNEL); if (!cb->rdma_buf) { - DEBUG_LOG(cb, "rdma_buf malloc failed\n"); + DEBUG_LOG(PFX "rdma_buf allocation failed\n"); ret = -ENOMEM; goto bail; } - - cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, - cb->rdma_buf, cb->size, - 
DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); - if (cb->mem != DMA) { - switch (cb->mem) { - case FASTREG: - cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + - PAGE_SIZE) >> PAGE_SHIFT; - cb->page_list = ib_alloc_fast_reg_page_list( - cb->pd->device, - cb->page_list_len); - if (IS_ERR(cb->page_list)) { - DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->page_list); - goto bail; - } - cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, - cb->page_list->max_page_list_len); - if (IS_ERR(cb->fastreg_mr)) { - DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); - ret = PTR_ERR(cb->fastreg_mr); - goto bail; - } - DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" - " page_list_len %u\n", cb->fastreg_mr->rkey, - cb->page_list, cb->page_list_len); - break; - case MW: - cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); - if (IS_ERR(cb->mw)) { - DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); - ret = PTR_ERR(cb->mw); - goto bail; - } - DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); - /*FALLTHROUGH*/ - case MR: - buf.addr = cb->rdma_dma_addr; - buf.size = cb->size; - iovbase = cb->rdma_dma_addr; - cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - IB_ACCESS_LOCAL_WRITE| - IB_ACCESS_REMOTE_READ| - IB_ACCESS_REMOTE_WRITE, - &iovbase); - if (IS_ERR(cb->rdma_mr)) { - DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); - ret = PTR_ERR(cb->rdma_mr); - goto bail; - } - DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", - (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); - break; - default: - ret = -EINVAL; - goto bail; - break; - } + cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) + >> PAGE_SHIFT; + cb->reg_mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, + cb->page_list_len); + if (IS_ERR(cb->reg_mr)) { + ret = PTR_ERR(cb->reg_mr); + DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret); + goto bail; } + DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n", + cb->reg_mr->rkey, cb->page_list_len); - if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { - cb->start_buf = kmalloc(cb->size, GFP_KERNEL); + cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size, + &cb->start_dma_addr, + GFP_KERNEL); if (!cb->start_buf) { - DEBUG_LOG(cb, "start_buf malloc failed\n"); + DEBUG_LOG(PFX "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } - - cb->start_dma_addr = ib_dma_map_single(cb->pd->device, - cb->start_buf, cb->size, - DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); - - if (cb->mem == MR || cb->mem == MW) { - unsigned flags = IB_ACCESS_REMOTE_READ; - - if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { - flags |= IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE; - } - - buf.addr = cb->start_dma_addr; - buf.size = cb->size; - DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", - (uintmax_t)buf.addr, (int)buf.size); - iovbase = cb->start_dma_addr; - cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - flags, - &iovbase); - - if (IS_ERR(cb->start_mr)) { - DEBUG_LOG(cb, "start_buf reg_mr failed\n"); - ret = PTR_ERR(cb->start_mr); - goto bail; - } - } } krping_setup_wr(cb); - DEBUG_LOG(cb, "allocated & registered buffers...\n"); + DEBUG_LOG(PFX "allocated & registered buffers...\n"); return 0; bail: - if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) - ib_dereg_mr(cb->fastreg_mr); - if (cb->mw && !IS_ERR(cb->mw)) - ib_dealloc_mw(cb->mw); + if (cb->reg_mr && !IS_ERR(cb->reg_mr)) + ib_dereg_mr(cb->reg_mr); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); - if (cb->page_list 
&& !IS_ERR(cb->page_list)) - ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); - if (cb->recv_mr && !IS_ERR(cb->recv_mr)) - ib_dereg_mr(cb->recv_mr); - if (cb->send_mr && !IS_ERR(cb->send_mr)) - ib_dereg_mr(cb->send_mr); - if (cb->rdma_buf) - kfree(cb->rdma_buf); - if (cb->start_buf) - kfree(cb->start_buf); + if (cb->rdma_buf) { + ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf, + cb->rdma_dma_addr); + } + if (cb->start_buf) { + ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, + cb->start_dma_addr); + } return ret; } static void krping_free_buffers(struct krping_cb *cb) { - DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); + DEBUG_LOG("krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); - if (cb->send_mr) - ib_dereg_mr(cb->send_mr); - if (cb->recv_mr) - ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); - if (cb->fastreg_mr) - ib_dereg_mr(cb->fastreg_mr); - if (cb->mw) - ib_dealloc_mw(cb->mw); + if (cb->reg_mr) + ib_dereg_mr(cb->reg_mr); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), @@ -756,15 +596,13 @@ dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); - dma_unmap_single(cb->pd->device->dma_device, - pci_unmap_addr(cb, rdma_mapping), - cb->size, DMA_BIDIRECTIONAL); - kfree(cb->rdma_buf); + + ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf, + cb->rdma_dma_addr); + if (cb->start_buf) { - dma_unmap_single(cb->pd->device->dma_device, - pci_unmap_addr(cb, start_mapping), - cb->size, DMA_BIDIRECTIONAL); - kfree(cb->start_buf); + ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf, + cb->start_dma_addr); } } @@ -776,6 +614,11 @@ memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; + + /* For flush_qp() */ + init_attr.cap.max_send_wr++; + init_attr.cap.max_recv_wr++; + init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; @@ -806,38 +649,42 @@ static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; - cb->pd = ib_alloc_pd(cm_id->device); + struct ib_cq_init_attr attr = {0}; + + cb->pd = ib_alloc_pd(cm_id->device, 0); if (IS_ERR(cb->pd)) { - PRINTF(cb, "ib_alloc_pd failed\n"); + printk(KERN_ERR PFX "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } - DEBUG_LOG(cb, "created pd %p\n", cb->pd); + DEBUG_LOG("created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); + attr.cqe = cb->txdepth * 2; + attr.comp_vector = 0; cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, - cb, cb->txdepth * 2, 0); + cb, &attr); if (IS_ERR(cb->cq)) { - PRINTF(cb, "ib_create_cq failed\n"); + printk(KERN_ERR PFX "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } - DEBUG_LOG(cb, "created cq %p\n", cb->cq); + DEBUG_LOG("created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { - PRINTF(cb, "ib_create_cq failed\n"); + printk(KERN_ERR PFX "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { - PRINTF(cb, "krping_create_qp failed: %d\n", ret); + printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret); goto err2; } - DEBUG_LOG(cb, "created qp %p\n", cb->qp); + DEBUG_LOG("created qp %p\n", cb->qp); return 0; err2: 
ib_destroy_cq(cb->cq); @@ -848,115 +695,54 @@ /* * return the (possibly rebound) rkey for the rdma buffer. - * FASTREG mode: invalidate and rebind via fastreg wr. - * MW mode: rebind the MW. + * REG mode: invalidate and rebind via reg wr. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { - u32 rkey = 0xffffffff; - u64 p; + u32 rkey; struct ib_send_wr *bad_wr; - int i; int ret; + struct scatterlist sg = {0}; - switch (cb->mem) { - case FASTREG: - cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; - - /* - * Update the fastreg key. - */ - ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); - cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; - - /* - * Update the fastreg WR with new buf info. - */ - if (buf == (u64)cb->start_dma_addr) - cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; - else - cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - cb->fastreg_wr.wr.fast_reg.iova_start = buf; - p = (u64)(buf & PAGE_MASK); - for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; - i++, p += PAGE_SIZE) { - cb->page_list->page_list[i] = p; - DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); - } + cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey; - DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" - " iova_start %jx page_list_len %u\n", - post_inv, - cb->fastreg_wr.wr.fast_reg.rkey, - cb->fastreg_wr.wr.fast_reg.page_shift, - (unsigned)cb->fastreg_wr.wr.fast_reg.length, - (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, - cb->fastreg_wr.wr.fast_reg.page_list_len); - - if (post_inv) - ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); - else - ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - cb->state = ERROR; - } - rkey = cb->fastreg_mr->rkey; - break; - case MW: - /* - * Update the MW with new buf info. - */ - if (buf == (u64)cb->start_dma_addr) { -#ifdef BIND_INFO - cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; - cb->bind_attr.bind_info.mr = cb->start_mr; -#else - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; - cb->bind_attr.mr = cb->start_mr; -#endif - } else { -#ifdef BIND_INFO - cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; - cb->bind_attr.bind_info.mr = cb->rdma_mr; -#else - cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; - cb->bind_attr.mr = cb->rdma_mr; -#endif - } -#ifdef BIND_INFO - cb->bind_attr.bind_info.addr = buf; -#else - cb->bind_attr.addr = buf; -#endif - DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", -#ifdef BIND_INFO - cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); -#else - cb->mw->rkey, buf, cb->bind_attr.mr->rkey); -#endif - ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); - if (ret) { - PRINTF(cb, "bind mw error %d\n", ret); - cb->state = ERROR; - } else - rkey = cb->mw->rkey; - break; - case MR: - if (buf == (u64)cb->start_dma_addr) - rkey = cb->start_mr->rkey; - else - rkey = cb->rdma_mr->rkey; - break; - case DMA: - rkey = cb->dma_mr->rkey; - break; - default: - PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); + /* + * Update the reg key. + */ + ib_update_fast_reg_key(cb->reg_mr, ++cb->key); + cb->reg_mr_wr.key = cb->reg_mr->rkey; + + /* + * Update the reg WR with new buf info. 
+ */ + if (buf == (u64)cb->start_dma_addr) + cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ; + else + cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + sg_dma_address(&sg) = buf; + sg_dma_len(&sg) = cb->size; + + ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE); + BUG_ON(ret <= 0 || ret > cb->page_list_len); + + DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u" + " iova_start %llx\n", + post_inv, + cb->reg_mr_wr.key, + cb->reg_mr->page_size, + cb->reg_mr->length, + (unsigned long long)cb->reg_mr->iova); + + if (post_inv) + ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); + else + ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr); + if (ret) { + printk(KERN_ERR PFX "post send error %d\n", ret); cb->state = ERROR; - break; } + rkey = cb->reg_mr->rkey; return rkey; } @@ -966,16 +752,16 @@ u32 rkey; /* - * Client side will do fastreg or mw bind before + * Client side will do reg or mw bind before * advertising the rdma buffer. Server side * sends have no data. */ - if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); - DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", + DEBUG_LOG("RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } @@ -989,111 +775,102 @@ /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { - PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", + printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received sink adv\n"); + DEBUG_LOG("server received sink adv\n"); - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->remote_len; - cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len; + cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv); + cb->rdma_sq_wr.wr.next = NULL; /* Issue RDMA Read. */ if (cb->read_inv) - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; - if (cb->mem == FASTREG) { - /* - * Immediately follow the read with a - * fenced LOCAL_INV. - */ - cb->rdma_sq_wr.next = &inv; - memset(&inv, 0, sizeof inv); - inv.opcode = IB_WR_LOCAL_INV; - inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; - inv.send_flags = IB_SEND_FENCE; - } + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; + /* + * Immediately follow the read with a + * fenced LOCAL_INV. 
+ */ + cb->rdma_sq_wr.wr.next = &inv; + memset(&inv, 0, sizeof inv); + inv.opcode = IB_WR_LOCAL_INV; + inv.ex.invalidate_rkey = cb->reg_mr->rkey; + inv.send_flags = IB_SEND_FENCE; } - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - cb->rdma_sq_wr.next = NULL; + cb->rdma_sq_wr.wr.next = NULL; - DEBUG_LOG(cb, "server posted rdma read req \n"); + DEBUG_LOG("server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received read complete\n"); + DEBUG_LOG("server received read complete\n"); /* Display data in recv buf */ - if (cb->verbose) { - if (strlen(cb->rdma_buf) > 128) { - char msgbuf[128]; - - strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); - PRINTF(cb, "server ping data stripped: %s\n", - msgbuf); - } else - PRINTF(cb, "server ping data: %s\n", - cb->rdma_buf); - } + if (cb->verbose) + printk(KERN_INFO PFX "server ping data: %s\n", + cb->rdma_buf); /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; - DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); + DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - DEBUG_LOG(cb, "server posted go ahead\n"); + DEBUG_LOG("server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server received sink adv\n"); + DEBUG_LOG("server received sink adv\n"); /* RDMA Write echo data */ - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) - cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; + cb->rdma_sgl.lkey = cb->pd->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); - DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", - cb->rdma_sq_wr.sg_list->lkey, - (unsigned long long)cb->rdma_sq_wr.sg_list->addr, - cb->rdma_sq_wr.sg_list->length); + DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n", + cb->rdma_sq_wr.wr.sg_list->lkey, + (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr, + cb->rdma_sq_wr.wr.sg_list->length); - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } @@ -1101,12 +878,12 @@ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { - PRINTF(cb, + printk(KERN_ERR PFX "wait for 
RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } - DEBUG_LOG(cb, "server rdma write complete \n"); + DEBUG_LOG("server rdma write complete \n"); cb->state = CONNECTED; @@ -1114,14 +891,14 @@ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; - DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); + DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); break; } - DEBUG_LOG(cb, "server posted go ahead\n"); + DEBUG_LOG("server posted go ahead\n"); } } @@ -1136,10 +913,10 @@ int ne; scnt = 0; - cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { @@ -1149,9 +926,9 @@ while (scnt < iters) { cb->state = RDMA_READ_ADV; - ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr); if (ret) { - PRINTF(cb, + printk(KERN_ERR PFX "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; @@ -1171,7 +948,7 @@ } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { - PRINTF(cb, + printk(KERN_ERR PFX "state == ERROR...bailing scnt %d\n", scnt); return; @@ -1179,13 +956,13 @@ } while (ne == 0); if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { - PRINTF(cb, "Completion wth error at %s:\n", + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, "Failed status %d: wr_id %d\n", + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } @@ -1198,7 +975,7 @@ stop_tv.tv_sec -= 1; } - PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); @@ -1224,34 +1001,34 @@ post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; @@ -1263,7 +1040,7 @@ ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { - PRINTF(cb, + printk(KERN_ERR PFX "state = ERROR, bailing\n"); return; } @@ -1276,8 +1053,8 @@ *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); return; @@ -1304,17 +1081,17 @@ ++ccnt; if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); - PRINTF(cb, + printk(KERN_ERR PFX "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; @@ -1333,7 +1110,7 @@ sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } - PRINTF(cb, + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), @@ -1366,34 +1143,34 @@ post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { - PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); + printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__); return; } - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = cb->size; + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; @@ -1405,8 +1182,8 @@ if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send: scnt=%d\n", scnt); return; @@ -1433,14 +1210,14 @@ ccnt += 1; if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); - PRINTF(cb, + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; @@ -1459,7 +1236,7 @@ sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } - PRINTF(cb, + printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), @@ -1489,20 +1266,21 @@ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } + wait_event_interruptible(cb->sem, cb->state == ERROR); } @@ -1521,18 +1299,18 @@ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } @@ -1555,18 +1333,18 @@ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completiong error %d\n", wc.status); return; } @@ -1575,922 +1353,267 @@ wait_event_interruptible(cb->sem, cb->state == ERROR); } -static int fastreg_supported(struct krping_cb *cb, int server) +static int reg_supported(struct ib_device *dev) { - struct ib_device *dev = server?cb->child_cm_id->device: - cb->cm_id->device; - struct ib_device_attr attr; - int ret; + u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; - ret = ib_query_device(dev, &attr); - if (ret) { - PRINTF(cb, "ib_query_device failed ret %d\n", ret); - return 0; - } - if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { - PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", - (unsigned long long)attr.device_cap_flags); + if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) { + printk(KERN_ERR PFX + "Fastreg not supported - device_cap_flags 0x%llx\n", + (unsigned long long)dev->attrs.device_cap_flags); return 0; } - DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", - (uintmax_t)attr.device_cap_flags); + DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n", + (unsigned long long)dev->attrs.device_cap_flags); return 1; } +static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb) +{ + memset(sin, 0, sizeof(*sin)); + + if (cb->addr_type == AF_INET) { + struct sockaddr_in *sin4 = 
(struct sockaddr_in *)sin; + sin4->sin_family = AF_INET; + memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4); + sin4->sin_port = cb->port; + } else if (cb->addr_type == AF_INET6) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; + sin6->sin6_family = AF_INET6; + memcpy((void *)&sin6->sin6_addr, cb->addr, 16); + sin6->sin6_port = cb->port; + } +} + static int krping_bind_server(struct krping_cb *cb) { - struct sockaddr_in sin; + struct sockaddr_storage sin; int ret; - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; - ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); + fill_sockaddr(&sin, cb); + + ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin); if (ret) { - PRINTF(cb, "rdma_bind_addr error %d\n", ret); + printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret); return ret; } - DEBUG_LOG(cb, "rdma_bind_addr successful\n"); + DEBUG_LOG("rdma_bind_addr successful\n"); - DEBUG_LOG(cb, "rdma_listen\n"); + DEBUG_LOG("rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { - PRINTF(cb, "rdma_listen failed: %d\n", ret); + printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { - PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", + printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } - if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) + if (!reg_supported(cb->child_cm_id->device)) return -EINVAL; return 0; } -/* - * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads - * complete. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. - */ -static void krping_fr_test5(struct krping_cb *cb) +static void krping_run_server(struct krping_cb *cb) { - struct ib_fast_reg_page_list **pl; - struct ib_send_wr *fr, *read, *bad; - struct ib_wc wc; - struct ib_sge *sgl; - u8 key = 0; - struct ib_mr **mr; - u8 **buf; - dma_addr_t *dma_addr; - int i; + struct ib_recv_wr *bad_wr; int ret; - int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - time_t start; - int count = 0; - int scnt; - int depth = cb->txdepth >> 1; - if (!depth) { - PRINTF(cb, "txdepth must be > 1 for this test!\n"); + ret = krping_bind_server(cb); + if (ret) return; + + ret = krping_setup_qp(cb, cb->child_cm_id); + if (ret) { + printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); + goto err0; } - pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); - mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); - fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); - sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); - read = kzalloc(sizeof *read * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); - buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); - dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); - if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { - PRINTF(cb, "kzalloc failed\n"); + ret = krping_setup_buffers(cb); 
+ if (ret) { + printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } - for (scnt = 0; scnt < depth; scnt++) { - pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl[scnt])) { - PRINTF(cb, "alloc_fr_page_list failed %ld\n", - PTR_ERR(pl[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); + ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + goto err2; + } - mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr[scnt])) { - PRINTF(cb, "alloc_fr failed %ld\n", - PTR_ERR(mr[scnt])); - goto err2; + ret = krping_accept(cb); + if (ret) { + printk(KERN_ERR PFX "connect error %d\n", ret); + goto err2; + } + + if (cb->wlat) + krping_wlat_test_server(cb); + else if (cb->rlat) + krping_rlat_test_server(cb); + else if (cb->bw) + krping_bw_test_server(cb); + else + krping_test_server(cb); + rdma_disconnect(cb->child_cm_id); +err2: + krping_free_buffers(cb); +err1: + krping_free_qp(cb); +err0: + rdma_destroy_id(cb->child_cm_id); +} + +static void krping_test_client(struct krping_cb *cb) +{ + int ping, start, cc, i, ret; + struct ib_send_wr *bad_wr; + unsigned char c; + + start = 65; + for (ping = 0; !cb->count || ping < cb->count; ping++) { + cb->state = RDMA_READ_ADV; + + /* Put some ascii text in the buffer. */ + cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); + for (i = cc, c = start; i < cb->size; i++) { + cb->start_buf[i] = c; + c++; + if (c > 122) + c = 65; } - DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); - ib_update_fast_reg_key(mr[scnt], ++key); + start++; + if (start > 122) + start = 65; + cb->start_buf[cb->size - 1] = 0; - buf[scnt] = kmalloc(cb->size, GFP_KERNEL); - if (!buf[scnt]) { - PRINTF(cb, "kmalloc failed\n"); - ret = -ENOMEM; - goto err2; + krping_format_send(cb, cb->start_dma_addr); + if (cb->state == ERROR) { + printk(KERN_ERR PFX "krping_format_send failed\n"); + break; } - DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = ib_dma_map_single(cb->pd->device, - buf[scnt], cb->size, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(cb->pd->device->dma_device, - dma_addr[scnt])) { - PRINTF(cb, "dma_map failed\n"); - ret = -ENOMEM; - goto err2; + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + printk(KERN_ERR PFX "post send error %d\n", ret); + break; } - DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); - for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); - DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", - __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); + + /* Wait for server to ACK */ + wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); + if (cb->state != RDMA_WRITE_ADV) { + printk(KERN_ERR PFX + "wait for RDMA_WRITE_ADV state %d\n", + cb->state); + break; } - sgl[scnt].lkey = mr[scnt]->rkey; - sgl[scnt].length = cb->size; - sgl[scnt].addr = (u64)buf[scnt]; - DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", - __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, - (uintmax_t)sgl[scnt].addr); - - fr[scnt].opcode = IB_WR_FAST_REG_MR; - fr[scnt].wr_id = scnt; - fr[scnt].send_flags = 0; - fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; - fr[scnt].wr.fast_reg.length = cb->size; - fr[scnt].wr.fast_reg.page_list = pl[scnt]; - fr[scnt].wr.fast_reg.page_list_len = plen; - fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; - fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | 
IB_ACCESS_LOCAL_WRITE; - fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; - fr[scnt].next = &read[scnt]; - read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; - read[scnt].wr_id = scnt; - read[scnt].send_flags = IB_SEND_SIGNALED; - read[scnt].wr.rdma.rkey = cb->remote_rkey; - read[scnt].wr.rdma.remote_addr = cb->remote_addr; - read[scnt].num_sge = 1; - read[scnt].sg_list = &sgl[scnt]; - ret = ib_post_send(cb->qp, &fr[scnt], &bad); + krping_format_send(cb, cb->rdma_dma_addr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err2; + printk(KERN_ERR PFX "post send error %d\n", ret); + break; } - } - start = time_uptime; - DEBUG_LOG(cb, "%s starting IO.\n", __func__); - while (!cb->count || cb->server || count < cb->count) { - if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, - count); - wait_event_interruptible_timeout(cb->sem, - cb->state == ERROR, - 1); - if (cb->state == ERROR) - break; - start = time_uptime; - } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", - ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, - "completion error %u wr_id %ju " - "opcode %d\n", wc.status, - (uintmax_t)wc.wr_id, wc.opcode); - goto err2; - } - count++; - if (count == cb->count) - break; - ib_update_fast_reg_key(mr[wc.wr_id], ++key); - fr[wc.wr_id].wr.fast_reg.rkey = - mr[wc.wr_id]->rkey; - sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; - ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); - if (ret) { - PRINTF(cb, - "ib_post_send failed %d\n", ret); - goto err2; - } - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } - DEBUG_LOG(cb, "%s done!\n", __func__); -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); + /* Wait for the server to say the RDMA Write is complete. 
*/ + wait_event_interruptible(cb->sem, + cb->state >= RDMA_WRITE_COMPLETE); + if (cb->state != RDMA_WRITE_COMPLETE) { + printk(KERN_ERR PFX + "wait for RDMA_WRITE_COMPLETE state %d\n", + cb->state); break; } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, "completion error %u " - "opcode %u\n", wc.status, wc.opcode); + + if (cb->validate) + if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { + printk(KERN_ERR PFX "data mismatch!\n"); + break; } - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mrs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (mr[scnt]) { - ib_dereg_mr(mr[scnt]); - DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); - } - } - DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (buf[scnt]) { - dma_unmap_single(cb->pd->device->dma_device, - dma_addr[scnt], cb->size, - DMA_BIDIRECTIONAL); - kfree(buf[scnt]); - DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); - } - } - DEBUG_LOG(cb, "destroying fr page lists!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (pl[scnt]) { - DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); - ib_free_fast_reg_page_list(pl[scnt]); - } + if (cb->verbose) + printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf); +#ifdef SLOW_KRPING + wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); +#endif } -err1: - if (pl) - kfree(pl); - if (mr) - kfree(mr); - if (fr) - kfree(fr); - if (read) - kfree(read); - if (sgl) - kfree(sgl); - if (buf) - kfree(buf); - if (dma_addr) - kfree(dma_addr); -} -static void krping_fr_test_server(struct krping_cb *cb) -{ - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); } -static void krping_fr_test5_server(struct krping_cb *cb) +static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; - /* Spin waiting for client's Start STAG/TO/Len */ - while (cb->state < RDMA_READ_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, - cb->remote_rkey, (uintmax_t)cb->remote_addr); + cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); + if (cb->state == ERROR) { + printk(KERN_ERR PFX "krping_format_send failed\n"); + return; + } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } - if (cb->duplex) - krping_fr_test5(cb); - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); -} + /* Spin waiting for server's Start STAG/TO/Len */ + while (cb->state < RDMA_WRITE_ADV) { + krping_cq_event_handler(cb->cq, cb); + } -static void krping_fr_test5_client(struct krping_cb *cb) +#if 0 { - struct ib_send_wr *bad; + int i; + struct timeval start, stop; + time_t sec; + suseconds_t usec; + unsigned long long elapsed; struct ib_wc wc; - int ret; - - cb->state = RDMA_READ_ADV; - - /* Send STAG/TO/Len to server */ - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, 
"krping_format_send failed\n"); - return; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); - return; - } - - /* Spin waiting for server's Start STAG/TO/Len */ - while (cb->state < RDMA_WRITE_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, - (uintmax_t)cb->remote_addr); - - return krping_fr_test5(cb); -} - -/* - * sq-depth worth of write + fastreg + inv, reposting them as the invs - * complete. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. - * If a count is given, then the last IO will have a bogus lkey in the - * write work request. This reproduces a fw bug where the connection - * will get stuck if a fastreg is processed while the ulptx is failing - * the bad write. - */ -static void krping_fr_test6(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list **pl; - struct ib_send_wr *fr, *write, *inv, *bad; - struct ib_wc wc; - struct ib_sge *sgl; - u8 key = 0; - struct ib_mr **mr; - u8 **buf; - dma_addr_t *dma_addr; - int i; - int ret; - int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - unsigned long start; - int count = 0; - int scnt; - int depth = cb->txdepth / 3; - - if (!depth) { - PRINTF(cb, "txdepth must be > 3 for this test!\n"); - return; - } - - pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); - - mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); - - fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); - - sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); - - write = kzalloc(sizeof *write * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth); - - inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); - - buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); - - dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); - DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); - - if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { - PRINTF(cb, "kzalloc failed\n"); - goto err1; - } - - for (scnt = 0; scnt < depth; scnt++) { - pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl[scnt])) { - PRINTF(cb, "alloc_fr_page_list failed %ld\n", - PTR_ERR(pl[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); - - mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr[scnt])) { - PRINTF(cb, "alloc_fr failed %ld\n", - PTR_ERR(mr[scnt])); - goto err2; - } - DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); - ib_update_fast_reg_key(mr[scnt], ++key); - - buf[scnt] = kmalloc(cb->size, GFP_KERNEL); - if (!buf[scnt]) { - PRINTF(cb, "kmalloc failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = 
ib_dma_map_single(cb->pd->device, - buf[scnt], cb->size, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(cb->pd->device->dma_device, - dma_addr[scnt])) { - PRINTF(cb, "dma_map failed\n"); - ret = -ENOMEM; - goto err2; - } - DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); - for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); - DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", - __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); - } - - write[scnt].opcode = IB_WR_RDMA_WRITE; - write[scnt].wr_id = scnt; - write[scnt].wr.rdma.rkey = cb->remote_rkey; - write[scnt].wr.rdma.remote_addr = cb->remote_addr; - write[scnt].num_sge = 1; - write[scnt].sg_list = &cb->rdma_sgl; - write[scnt].sg_list->length = cb->size; - write[scnt].next = &fr[scnt]; - - fr[scnt].opcode = IB_WR_FAST_REG_MR; - fr[scnt].wr_id = scnt; - fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; - fr[scnt].wr.fast_reg.length = cb->size; - fr[scnt].wr.fast_reg.page_list = pl[scnt]; - fr[scnt].wr.fast_reg.page_list_len = plen; - fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; - fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; - fr[scnt].next = &inv[scnt]; - - inv[scnt].opcode = IB_WR_LOCAL_INV; - inv[scnt].send_flags = IB_SEND_SIGNALED; - inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; - - ret = ib_post_send(cb->qp, &write[scnt], &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err2; - } - } - - start = time_uptime; - DEBUG_LOG(cb, "%s starting IO.\n", __func__); - while (!cb->count || cb->server || count < cb->count) { - if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, - count); - wait_event_interruptible_timeout(cb->sem, - cb->state == ERROR, - 1); - if (cb->state == ERROR) - break; - start = time_uptime; - } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", - ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, - "completion error %u wr_id %ju " - "opcode %d\n", wc.status, - (uintmax_t)wc.wr_id, wc.opcode); - goto err2; - } - count++; - if (count == (cb->count -1)) - cb->rdma_sgl.lkey = 0x00dead; - if (count == cb->count) - break; - ib_update_fast_reg_key(mr[wc.wr_id], ++key); - fr[wc.wr_id].wr.fast_reg.rkey = - mr[wc.wr_id]->rkey; - inv[wc.wr_id].ex.invalidate_rkey = - mr[wc.wr_id]->rkey; - ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); - if (ret) { - PRINTF(cb, - "ib_post_send failed %d\n", ret); - goto err2; - } - } else if (krping_sigpending()){ - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } - DEBUG_LOG(cb, "%s done!\n", __func__); -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, "completion error %u " - "opcode %u\n", wc.status, wc.opcode); - } - } - } while (ret == 1); - - DEBUG_LOG(cb, "destroying fr mrs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (mr[scnt]) { - ib_dereg_mr(mr[scnt]); - DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); - } - } - DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (buf[scnt]) { - dma_unmap_single(cb->pd->device->dma_device, - dma_addr[scnt], cb->size, - 
DMA_BIDIRECTIONAL); - kfree(buf[scnt]); - DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); - } - } - DEBUG_LOG(cb, "destroying fr page lists!\n"); - for (scnt = 0; scnt < depth; scnt++) { - if (pl[scnt]) { - DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); - ib_free_fast_reg_page_list(pl[scnt]); - } - } -err1: - if (pl) - kfree(pl); - if (mr) - kfree(mr); - if (fr) - kfree(fr); - if (write) - kfree(write); - if (inv) - kfree(inv); - if (sgl) - kfree(sgl); - if (buf) - kfree(buf); - if (dma_addr) - kfree(dma_addr); -} - -static void krping_fr_test6_server(struct krping_cb *cb) -{ - struct ib_send_wr *bad_wr; - struct ib_wc wc; - int ret; - - /* Spin waiting for client's Start STAG/TO/Len */ - while (cb->state < RDMA_READ_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, - cb->remote_rkey, (uintmax_t)cb->remote_addr); - - /* Send STAG/TO/Len to client */ - krping_format_send(cb, cb->start_dma_addr); - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completiong error %d\n", wc.status); - return; - } - - if (cb->duplex) - krping_fr_test6(cb); - DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); - wait_event_interruptible(cb->sem, cb->state == ERROR); -} - -static void krping_fr_test6_client(struct krping_cb *cb) -{ - struct ib_send_wr *bad; - struct ib_wc wc; - int ret; - - cb->state = RDMA_READ_ADV; - - /* Send STAG/TO/Len to server */ - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); - return; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); - return; - } - - /* Spin waiting for server's Start STAG/TO/Len */ - while (cb->state < RDMA_WRITE_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, - (uintmax_t)cb->remote_addr); - - return krping_fr_test6(cb); -} - -static void krping_run_server(struct krping_cb *cb) -{ - struct ib_recv_wr *bad_wr; - int ret; - - ret = krping_bind_server(cb); - if (ret) - return; - - ret = krping_setup_qp(cb, cb->child_cm_id); - if (ret) { - PRINTF(cb, "setup_qp failed: %d\n", ret); - goto err0; - } - - ret = krping_setup_buffers(cb); - if (ret) { - PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); - goto err1; - } - - ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "ib_post_recv failed: %d\n", ret); - goto err2; - } - - ret = krping_accept(cb); - if (ret) { - PRINTF(cb, "connect error %d\n", ret); - goto err2; - } - - if (cb->wlat) - krping_wlat_test_server(cb); - else if (cb->rlat) - krping_rlat_test_server(cb); - else if (cb->bw) - krping_bw_test_server(cb); - else if (cb->frtest) { - switch (cb->testnum) { - case 1: - case 2: - case 3: - case 4: - krping_fr_test_server(cb); - break; - case 5: - krping_fr_test5_server(cb); - break; - case 6: - krping_fr_test6_server(cb); - break; - default: - PRINTF(cb, 
"unknown fr test %d\n", cb->testnum); - goto err2; - break; - } - } else - krping_test_server(cb); - rdma_disconnect(cb->child_cm_id); -err2: - krping_free_buffers(cb); -err1: - krping_free_qp(cb); -err0: - rdma_destroy_id(cb->child_cm_id); -} - -static void krping_test_client(struct krping_cb *cb) -{ - int ping, start, cc, i, ret; - struct ib_send_wr *bad_wr; - unsigned char c; - - start = 65; - for (ping = 0; !cb->count || ping < cb->count; ping++) { - cb->state = RDMA_READ_ADV; - - /* Put some ascii text in the buffer. */ - cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); - for (i = cc, c = start; i < cb->size; i++) { - cb->start_buf[i] = c; - c++; - if (c > 122) - c = 65; - } - start++; - if (start > 122) - start = 65; - cb->start_buf[cb->size - 1] = 0; - - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); - break; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - break; - } - - /* Wait for server to ACK */ - wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); - if (cb->state != RDMA_WRITE_ADV) { - PRINTF(cb, - "wait for RDMA_WRITE_ADV state %d\n", - cb->state); - break; - } - - krping_format_send(cb, cb->rdma_dma_addr); - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - break; - } - - /* Wait for the server to say the RDMA Write is complete. */ - wait_event_interruptible(cb->sem, - cb->state >= RDMA_WRITE_COMPLETE); - if (cb->state != RDMA_WRITE_COMPLETE) { - PRINTF(cb, - "wait for RDMA_WRITE_COMPLETE state %d\n", - cb->state); - break; - } - - if (cb->validate) - if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { - PRINTF(cb, "data mismatch!\n"); - break; - } - - if (cb->verbose) { - if (strlen(cb->rdma_buf) > 128) { - char msgbuf[128]; - - strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); - PRINTF(cb, "ping data stripped: %s\n", - msgbuf); - } else - PRINTF(cb, "ping data: %s\n", cb->rdma_buf); - } -#ifdef SLOW_KRPING - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); -#endif - } -} - -static void krping_rlat_test_client(struct krping_cb *cb) -{ - struct ib_send_wr *bad_wr; - struct ib_wc wc; - int ret; - - cb->state = RDMA_READ_ADV; - - /* Send STAG/TO/Len to client */ - krping_format_send(cb, cb->start_dma_addr); - if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); - return; - } - ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); - if (ret) { - PRINTF(cb, "post send error %d\n", ret); - return; - } - - /* Spin waiting for send completion */ - while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); - if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); - return; - } - if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); - return; - } - - /* Spin waiting for server's Start STAG/TO/Len */ - while (cb->state < RDMA_WRITE_ADV) { - krping_cq_event_handler(cb->cq, cb); - } - -#if 0 -{ - int i; - struct timeval start, stop; - time_t sec; - suseconds_t usec; - unsigned long long elapsed; - struct ib_wc wc; - struct ib_send_wr *bad_wr; - int ne; - - cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; - cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; - cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; - cb->rdma_sq_wr.sg_list->length = 0; - cb->rdma_sq_wr.num_sge = 0; + struct ib_send_wr *bad_wr; + int ne; + + cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.rkey = cb->remote_rkey; + cb->rdma_sq_wr.remote_addr = cb->remote_addr; + 
cb->rdma_sq_wr.wr.sg_list->length = 0; + cb->rdma_sq_wr.wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { - if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { - PRINTF(cb, "Couldn't post send\n"); + if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) { + printk(KERN_ERR PFX "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { - PRINTF(cb, "poll CQ failed %d\n", ne); + printk(KERN_ERR PFX "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { - PRINTF(cb, "Completion wth error at %s:\n", + printk(KERN_ERR PFX "Completion wth error at %s:\n", cb->server ? "server" : "client"); - PRINTF(cb, "Failed status %d: wr_id %d\n", + printk(KERN_ERR PFX "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } @@ -2504,7 +1627,7 @@ sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; - PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); + printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif @@ -2522,23 +1645,23 @@ /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } @@ -2561,23 +1684,23 @@ /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { - PRINTF(cb, "krping_format_send failed\n"); + printk(KERN_ERR PFX "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { - PRINTF(cb, "post send error %d\n", ret); + printk(KERN_ERR PFX "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { - PRINTF(cb, "poll error %d\n", ret); + printk(KERN_ERR PFX "poll error %d\n", ret); return; } if (wc.status) { - PRINTF(cb, "send completion error %d\n", wc.status); + printk(KERN_ERR PFX "send completion error %d\n", wc.status); return; } @@ -2589,261 +1712,101 @@ bw_test(cb); } - /* - * fastreg 2 valid different mrs and verify the completions. 
+ * Manual qp flush test */ -static void krping_fr_test1(struct krping_cb *cb) +static void flush_qp(struct krping_cb *cb) { - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, *bad; + struct ib_send_wr wr = { 0 }, *bad; + struct ib_recv_wr recv_wr = { 0 }, *recv_bad; struct ib_wc wc; - struct ib_mr *mr1, *mr2; - int i; int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); - return; - } - - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr2)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err2; - } + int flushed = 0; + int ccnt = 0; + rdma_disconnect(cb->cm_id); + DEBUG_LOG("disconnected!\n"); - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); + wr.opcode = IB_WR_SEND; + wr.wr_id = 0xdeadbeefcafebabe; + ret = ib_post_send(cb->qp, &wr, &bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - fr.wr.fast_reg.rkey = mr2->rkey; - DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err3; - } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); - count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; - } - - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 2); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr2!\n"); - - ib_dereg_mr(mr2); -err2: - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -/* - * fastreg the same mr twice, 2nd one should produce error cqe. 
- */ -static void krping_fr_test2(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, *bad; - struct ib_wc wc; - struct ib_mr *mr1; - int i; - int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); + printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret); return; } - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); + recv_wr.wr_id = 0xcafebabedeadbeef; + ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; + printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret); + return; } - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err3; - } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); - count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; - } - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 2); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); + /* poll until the flush WRs complete */ do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; - } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); + printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); + return; } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); + if (ret == 0) + continue; + ccnt++; + if (wc.wr_id == 0xdeadbeefcafebabe || + wc.wr_id == 0xcafebabedeadbeef) + flushed++; + } while (flushed != 2); + DEBUG_LOG("qp_flushed! ccnt %u\n", ccnt); } -/* - * fastreg pipelined in a loop as fast as we can until the user interrupts. - * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. 
- */ -static void krping_fr_test3(struct krping_cb *cb) +static void krping_fr_test(struct krping_cb *cb) { - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, inv, *bad; + struct ib_send_wr inv, *bad; + struct ib_reg_wr fr; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; - int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; + struct scatterlist sg = {0}; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); + mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen); + if (IS_ERR(mr)) { + printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr)); return; } - - mr = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; + + sg_dma_address(&sg) = 0xcafebabe0000UL; + sg_dma_len(&sg) = size; + ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); + if (ret <= 0) { + printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); + goto err2; } - for (i=0; ipage_list[i] = i * PAGE_SIZE; - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.next = &inv; + fr.wr.opcode = IB_WR_REG_MR; + fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + fr.mr = mr; + fr.wr.next = &inv; + memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; - DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); + DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; - while (1) { + while (!cb->count || count <= cb->count) { + if (SIGPENDING(curthread)) { + printk(KERN_ERR PFX "signal!\n"); + break; + } if ((time_uptime - start) >= 9) { - DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); + DEBUG_LOG("fr_test: pausing 1 second! 
count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; @@ -2851,183 +1814,44 @@ } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); - fr.wr.fast_reg.rkey = mr->rkey; + fr.key = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; + size = arc4random() % cb->size; if (size == 0) size = cb->size; - plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list_len = plen; - ret = ib_post_send(cb->qp, &fr, &bad); + sg_dma_len(&sg) = size; + ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE); + if (ret <= 0) { + printk(KERN_ERR PFX "ib_map_mr_sge err %d\n", ret); + goto err2; + } + ret = ib_post_send(cb->qp, &fr.wr, &bad); if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); + printk(KERN_ERR PFX "ib_post_send failed %d\n", ret); goto err2; } - scnt+=2; + scnt++; } - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err2; - } - if (ret == 1) { - if (wc.status) { - PRINTF(cb, "completion error %u\n", wc.status); - goto err2; - } - count++; - scnt--; - } - else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err2; - } - } while (ret == 1); - } -err2: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; + printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret); + goto err2; } if (ret == 1) { if (wc.status) { - PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); + printk(KERN_ERR PFX "completion error %u\n", wc.status); + goto err2; } - } - } while (ret == 1); - DEBUG_LOG(cb, "fr_test: done!\n"); - ib_dereg_mr(mr); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -/* - * fastreg 1 and invalidate 1 mr and verify completion. 
- */ -static void krping_fr_test4(struct krping_cb *cb) -{ - struct ib_fast_reg_page_list *pl; - struct ib_send_wr fr, inv, *bad; - struct ib_wc wc; - struct ib_mr *mr1; - int i; - int ret; - int size = cb->size; - int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; - int count = 0; - - pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); - if (IS_ERR(pl)) { - PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); - return; - } - - mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); - if (IS_ERR(mr1)) { - PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); - goto err1; - } - - for (i=0; ipage_list[i] = i * PAGE_SIZE; - - memset(&fr, 0, sizeof fr); - fr.opcode = IB_WR_FAST_REG_MR; - fr.wr_id = 1; - fr.wr.fast_reg.page_shift = PAGE_SHIFT; - fr.wr.fast_reg.length = size; - fr.wr.fast_reg.page_list = pl; - fr.wr.fast_reg.page_list_len = plen; - fr.wr.fast_reg.iova_start = 0; - fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; - fr.send_flags = IB_SEND_SIGNALED; - fr.wr.fast_reg.rkey = mr1->rkey; - fr.next = &inv; - memset(&inv, 0, sizeof inv); - inv.opcode = IB_WR_LOCAL_INV; - inv.ex.invalidate_rkey = mr1->rkey; - - DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); - ret = ib_post_send(cb->qp, &fr, &bad); - if (ret) { - PRINTF(cb, "ib_post_send failed %d\n", ret); - goto err3; - } - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - goto err3; - } - if (ret == 1) { - DEBUG_LOG(cb, "completion status %u wr %s\n", - wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; - } else if (krping_sigpending()) { - PRINTF(cb, "signal!\n"); - goto err3; - } - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - } while (count != 1); -err3: - DEBUG_LOG(cb, "sleeping 1 second\n"); - wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); - DEBUG_LOG(cb, "draining the cq...\n"); - do { - ret = ib_poll_cq(cb->cq, 1, &wc); - if (ret < 0) { - PRINTF(cb, "ib_poll_cq failed %d\n", ret); - break; + scnt--; } - if (ret == 1) { - PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); - } - } while (ret == 1); - DEBUG_LOG(cb, "destroying fr mr1!\n"); - ib_dereg_mr(mr1); -err1: - DEBUG_LOG(cb, "destroying fr page list!\n"); - ib_free_fast_reg_page_list(pl); - DEBUG_LOG(cb, "%s done!\n", __func__); -} - -static void krping_fr_test(struct krping_cb *cb) -{ - switch (cb->testnum) { - case 1: - krping_fr_test1(cb); - break; - case 2: - krping_fr_test2(cb); - break; - case 3: - krping_fr_test3(cb); - break; - case 4: - krping_fr_test4(cb); - break; - case 5: - krping_fr_test5_client(cb); - break; - case 6: - krping_fr_test6_client(cb); - break; - default: - PRINTF(cb, "Unkown frtest num %u\n", cb->testnum); - break; } +err2: + flush_qp(cb); + DEBUG_LOG("fr_test: done!\n"); + ib_dereg_mr(mr); } static int krping_connect_client(struct krping_cb *cb) @@ -3042,50 +1866,45 @@ ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { - PRINTF(cb, "rdma_connect error %d\n", ret); + printk(KERN_ERR PFX "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { - PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); + printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state); return -1; } - DEBUG_LOG(cb, "rdma_connect successful\n"); + 
DEBUG_LOG("rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { - struct sockaddr_in sin; + struct sockaddr_storage sin; int ret; - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; + fill_sockaddr(&sin, cb); - ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, - 2000); + ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000); if (ret) { - PRINTF(cb, "rdma_resolve_addr error %d\n", ret); + printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { - PRINTF(cb, + printk(KERN_ERR PFX "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } - if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) + if (!reg_supported(cb->cm_id->device)) return -EINVAL; - DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); + DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } @@ -3100,25 +1919,25 @@ ret = krping_setup_qp(cb, cb->cm_id); if (ret) { - PRINTF(cb, "setup_qp failed: %d\n", ret); + printk(KERN_ERR PFX "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { - PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); + printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - PRINTF(cb, "ib_post_recv failed: %d\n", ret); + printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { - PRINTF(cb, "connect error %d\n", ret); + printk(KERN_ERR PFX "connect error %d\n", ret); goto err2; } @@ -3139,7 +1958,7 @@ krping_free_qp(cb); } -int krping_doit(char *cmd, void *cookie) +int krping_doit(char *cmd) { struct krping_cb *cb; int op; @@ -3155,69 +1974,76 @@ list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); - cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; - cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { + struct in_addr in_addr; case 'a': cb->addr_str = optarg; - DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); - if (!inet_aton(optarg, &cb->addr)) { - PRINTF(cb, "bad addr string %s\n", + cb->addr_type = AF_INET; + DEBUG_LOG("ipaddr (%s)\n", optarg); + if (!inet_aton(optarg, &in_addr)) { + printk(KERN_ERR PFX "bad addr string %s\n", optarg); ret = EINVAL; } + memcpy(cb->addr, &in_addr.s_addr, sizeof(in_addr.s_addr)); + break; + case 'A': + cb->addr_str = optarg; + cb->addr_type = AF_INET6; + DEBUG_LOG("ipv6addr (%s)\n", optarg); + ret = EAFNOSUPPORT; /* XXX not supported */ break; case 'p': cb->port = htons(optint); - DEBUG_LOG(cb, "port %d\n", (int)optint); + DEBUG_LOG("port %d\n", (int)optint); break; case 'P': cb->poll = 1; - DEBUG_LOG(cb, "server\n"); + DEBUG_LOG("server\n"); break; case 's': cb->server = 1; - DEBUG_LOG(cb, "server\n"); + DEBUG_LOG("server\n"); break; case 'c': cb->server = 0; - DEBUG_LOG(cb, "client\n"); + DEBUG_LOG("client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { - PRINTF(cb, "Invalid size %d " + printk(KERN_ERR PFX "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else - DEBUG_LOG(cb, "size %d\n", (int)optint); + DEBUG_LOG("size 
%d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { - PRINTF(cb, "Invalid count %d\n", + printk(KERN_ERR PFX "Invalid count %d\n", cb->count); ret = EINVAL; } else - DEBUG_LOG(cb, "count %d\n", (int) cb->count); + DEBUG_LOG("count %d\n", (int) cb->count); break; case 'v': cb->verbose++; - DEBUG_LOG(cb, "verbose\n"); + DEBUG_LOG("verbose\n"); break; case 'V': cb->validate++; - DEBUG_LOG(cb, "validate data\n"); + DEBUG_LOG("validate data\n"); break; case 'l': cb->wlat++; @@ -3231,45 +2057,27 @@ case 'd': cb->duplex++; break; - case 'm': - if (!strncmp(optarg, "dma", 3)) - cb->mem = DMA; - else if (!strncmp(optarg, "fastreg", 7)) - cb->mem = FASTREG; - else if (!strncmp(optarg, "mw", 2)) - cb->mem = MW; - else if (!strncmp(optarg, "mr", 2)) - cb->mem = MR; - else { - PRINTF(cb, "unknown mem mode %s. " - "Must be dma, fastreg, mw, or mr\n", - optarg); - ret = -EINVAL; - break; - } - break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; - DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); + DEBUG_LOG("txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; - DEBUG_LOG(cb, "using local dma lkey\n"); + DEBUG_LOG("using local dma lkey\n"); break; case 'R': cb->read_inv = 1; - DEBUG_LOG(cb, "using read-with-inv\n"); + DEBUG_LOG("using read-with-inv\n"); break; case 'f': cb->frtest = 1; - cb->testnum = optint; - DEBUG_LOG(cb, "fast-reg test!\n"); + DEBUG_LOG("fast-reg test!\n"); break; default: - PRINTF(cb, "unknown opt %s\n", optarg); + printk(KERN_ERR PFX "unknown opt %s\n", optarg); ret = -EINVAL; break; } @@ -3278,48 +2086,43 @@ goto out; if (cb->server == -1) { - PRINTF(cb, "must be either client or server\n"); + printk(KERN_ERR PFX "must be either client or server\n"); ret = -EINVAL; goto out; } - if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { - PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); - ret = -EINVAL; - goto out; - } - if (cb->server_invalidate && cb->mem != FASTREG) { - PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); + if (cb->server && cb->frtest) { + printk(KERN_ERR PFX "must be client to run frtest\n"); ret = -EINVAL; goto out; } - if (cb->read_inv && cb->mem != FASTREG) { - PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); + if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { + printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } - if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { - PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); + if (cb->wlat || cb->rlat || cb->bw) { + printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n"); ret = -EINVAL; goto out; } - cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); + cb->cm_id = rdma_create_id(&init_net, krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); - PRINTF(cb, "rdma_create_id error %d\n", ret); + printk(KERN_ERR PFX "rdma_create_id error %d\n", ret); goto out; } - DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); + DEBUG_LOG("created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); - DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); + DEBUG_LOG("destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); @@ -3339,9 +2142,3 @@ (*f)(cb->pd ? 
&cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } - -void krping_init(void) -{ - - mutex_init(&krping_mutex); -} Index: sys/contrib/rdma/krping/krping_dev.c =================================================================== --- sys/contrib/rdma/krping/krping_dev.c +++ sys/contrib/rdma/krping/krping_dev.c @@ -72,7 +72,6 @@ switch (what) { case MOD_LOAD: /* kldload */ - krping_init(); krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "krping"); printf("Krping device loaded.\n"); @@ -204,7 +203,7 @@ *cp = 0; krpingmsg->len = (unsigned long)(cp - krpingmsg->msg); uprintf("krping: write string = |%s|\n", krpingmsg->msg); - err = krping_doit(krpingmsg->msg, curproc); + err = krping_doit(krpingmsg->msg); free(krpingmsg, M_DEVBUF); return(err); } Index: sys/modules/ibcore/Makefile =================================================================== --- sys/modules/ibcore/Makefile +++ sys/modules/ibcore/Makefile @@ -2,21 +2,40 @@ .PATH: ${SRCTOP}/sys/ofed/drivers/infiniband/core KMOD= ibcore -SRCS= addr.c iwcm.c sa_query.c ucma.c uverbs_cmd.c \ - agent.c multicast.c smi.c ud_header.c uverbs_main.c \ - mad.c peer_mem.c umem.c uverbs_marshall.c \ - cache.c device.c packer.c sysfs.c user_mad.c verbs.c \ - cm.c fmr_pool.c mad_rmpp.c ucm.c cma.c \ - vnode_if.h device_if.h bus_if.h pci_if.h \ - opt_inet.h opt_inet6.h +SRCS= vnode_if.h device_if.h bus_if.h pci_if.h \ + opt_inet.h opt_inet6.h \ + ib_addr.c \ + ib_agent.c \ + ib_cache.c \ + ib_cm.c \ + ib_cma.c \ + ib_cq.c \ + ib_device.c \ + ib_fmr_pool.c \ + ib_iwcm.c \ + ib_iwpm_msg.c \ + ib_iwpm_util.c \ + ib_mad.c \ + ib_mad_rmpp.c \ + ib_multicast.c \ + ib_packer.c \ + ib_roce_gid_mgmt.c \ + ib_sa_query.c \ + ib_smi.c \ + ib_sysfs.c \ + ib_ucm.c \ + ib_ucma.c \ + ib_ud_header.c \ + ib_umem.c \ + ib_user_mad.c \ + ib_uverbs_cmd.c \ + ib_uverbs_main.c \ + ib_uverbs_marshall.c \ + ib_verbs.c -CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/core CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include -CFLAGS+= -DINET6 -DINET +CFLAGS+= -DINET6 -DINET -DCONFIG_INFINIBAND_USER_MEM .include - -CFLAGS+= -Wno-cast-qual -Wno-pointer-arith - -CWARNFLAGS.cm.c= -Wno-unused-function Index: sys/modules/ipoib/Makefile =================================================================== --- sys/modules/ipoib/Makefile +++ sys/modules/ipoib/Makefile @@ -9,6 +9,7 @@ CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/ulp/ipoib CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include .include Index: sys/modules/rdma/krping/Makefile =================================================================== --- sys/modules/rdma/krping/Makefile +++ sys/modules/rdma/krping/Makefile @@ -6,6 +6,7 @@ SRCS+= bus_if.h device_if.h pci_if.h pcib_if.h vnode_if.h SRCS+= opt_sched.h opt_inet.h opt_inet6.h CFLAGS+= -I${SRCTOP}/sys/ofed/include +CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include .include Index: sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- sys/ofed/drivers/infiniband/core/addr.c +++ /dev/null @@ -1,686 +0,0 @@ -/* - * Copyright (c) 2005 Voltaire Inc. All rights reserved. - * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. - * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); - -struct addr_req { - struct list_head list; - struct sockaddr_storage src_addr; - struct sockaddr_storage dst_addr; - struct rdma_dev_addr *addr; - struct rdma_addr_client *client; - void *context; - void (*callback)(int status, struct sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context); - unsigned long timeout; - int status; -}; - -static void process_req(struct work_struct *work); - -static DEFINE_MUTEX(lock); -static LIST_HEAD(req_list); -static struct delayed_work work; -static struct workqueue_struct *addr_wq; - -static struct rdma_addr_client self; -void rdma_addr_register_client(struct rdma_addr_client *client) -{ - atomic_set(&client->refcount, 1); - init_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_register_client); - -static inline void put_client(struct rdma_addr_client *client) -{ - if (atomic_dec_and_test(&client->refcount)) - complete(&client->comp); -} - -void rdma_addr_unregister_client(struct rdma_addr_client *client) -{ - put_client(client); - wait_for_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_unregister_client); - -int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, - const unsigned char *dst_dev_addr) -{ - if (dev->if_type == IFT_INFINIBAND) - dev_addr->dev_type = ARPHRD_INFINIBAND; - else if (dev->if_type == IFT_ETHER) - dev_addr->dev_type = ARPHRD_ETHER; - else - dev_addr->dev_type = 0; - memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); - memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr), - dev->if_addrlen); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen); - dev_addr->bound_dev_if = dev->if_index; - return 0; -} -EXPORT_SYMBOL(rdma_copy_addr); - -#define SCOPE_ID_CACHE(_scope_id, _addr6) do { \ - (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \ - (_addr6)->sin6_scope_id = 0; } while (0) - -#define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \ - (_addr6)->sin6_scope_id = (_scope_id); \ - (_addr6)->sin6_addr.s6_addr[3] 
= 0; } while (0) - -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id) -{ - struct net_device *dev; - int ret = -EADDRNOTAVAIL; - - if (dev_addr->bound_dev_if) { - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!dev) - return -ENODEV; - ret = rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - return ret; - } - - switch (addr->sa_family) { - case AF_INET: - dev = ip_dev_find(&init_net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - - if (!dev) - return ret; - - ret = rdma_copy_addr(dev_addr, dev, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - break; - -#if defined(INET6) - case AF_INET6: - { - struct sockaddr_in6 *sin6; - struct ifaddr *ifa; - in_port_t port; - uint32_t scope_id; - - sin6 = (struct sockaddr_in6 *)addr; - port = sin6->sin6_port; - sin6->sin6_port = 0; - scope_id = sin6->sin6_scope_id; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_CACHE(scope_id, sin6); - CURVNET_SET_QUIET(&init_net); - ifa = ifa_ifwithaddr(addr); - CURVNET_RESTORE(); - sin6->sin6_port = port; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - if (ifa == NULL) { - ret = -ENODEV; - break; - } - ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); - ifa_free(ifa); - break; - } -#endif - default: - break; - } - return ret; -} -EXPORT_SYMBOL(rdma_translate_ip); - -static void set_timeout(unsigned long time) -{ - unsigned long delay; - - delay = time - jiffies; - if ((long)delay <= 0) - delay = 1; - - mod_delayed_work(addr_wq, &work, delay); -} - -static void queue_req(struct addr_req *req) -{ - struct addr_req *temp_req; - - mutex_lock(&lock); - list_for_each_entry_reverse(temp_req, &req_list, list) { - if (time_after_eq(req->timeout, temp_req->timeout)) - break; - } - - list_add(&req->list, &temp_req->list); - - if (req_list.next == &req->list) - set_timeout(req->timeout); - mutex_unlock(&lock); -} - -static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) -{ - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - struct ifaddr *ifa; - struct ifnet *ifp; - struct rtentry *rte; -#if defined(INET) || defined(INET6) - in_port_t port; -#endif -#ifdef INET6 - uint32_t scope_id; -#endif - u_char edst[MAX_ADDR_LEN]; - int multi; - int bcast; - int is_gw = 0; - int error = 0; - - CURVNET_SET_QUIET(&init_net); - - /* - * Determine whether the address is unicast, multicast, or broadcast - * and whether the source interface is valid. - */ - multi = 0; - bcast = 0; - sin = NULL; - sin6 = NULL; - ifp = NULL; - rte = NULL; - ifa = NULL; - ifp = NULL; - memset(edst, 0, sizeof(edst)); -#ifdef INET6 - scope_id = -1U; -#endif - - switch (dst_in->sa_family) { -#ifdef INET - case AF_INET: - sin = (struct sockaddr_in *)dst_in; - if (sin->sin_addr.s_addr == INADDR_BROADCAST) - bcast = 1; - if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - multi = 1; - sin = (struct sockaddr_in *)src_in; - if (sin->sin_addr.s_addr != INADDR_ANY) { - /* - * Address comparison fails if the port is set - * cache it here to be restored later. 
- */ - port = sin->sin_port; - sin->sin_port = 0; - memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - - /* - * If we have a source address to use look it - * up first and verify that it is a local - * interface: - */ - CURVNET_SET_QUIET(&init_net); - ifa = ifa_ifwithaddr(src_in); - CURVNET_RESTORE(); - sin->sin_port = port; - if (ifa == NULL) { - error = ENETUNREACH; - goto done; - } - ifp = ifa->ifa_ifp; - ifa_free(ifa); - if (bcast || multi) - goto mcast; - } - break; -#endif -#ifdef INET6 - case AF_INET6: - sin6 = (struct sockaddr_in6 *)dst_in; - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) - multi = 1; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - /* - * The IB address comparison fails if the - * scope ID is set and not part of the addr: - */ - scope_id = sin6->sin6_scope_id; - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } - sin6 = (struct sockaddr_in6 *)src_in; - if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - port = sin6->sin6_port; - sin6->sin6_port = 0; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } - - /* - * If we have a source address to use look it - * up first and verify that it is a local - * interface: - */ - CURVNET_SET_QUIET(&init_net); - ifa = ifa_ifwithaddr(src_in); - CURVNET_RESTORE(); - sin6->sin6_port = port; - if (ifa == NULL) { - error = ENETUNREACH; - goto done; - } - ifp = ifa->ifa_ifp; - ifa_free(ifa); - if (bcast || multi) - goto mcast; - } - break; -#endif - default: - error = EINVAL; - goto done; - } - /* - * Make sure the route exists and has a valid link. - */ - rte = rtalloc1(dst_in, 1, 0); - if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { - if (rte) - RTFREE_LOCKED(rte); - error = EHOSTUNREACH; - goto done; - } - if (rte->rt_flags & RTF_GATEWAY) - is_gw = 1; - /* - * If it's not multicast or broadcast and the route doesn't match the - * requested interface return unreachable. Otherwise fetch the - * correct interface pointer and unlock the route. - */ - if (multi || bcast) { - if (ifp == NULL) { - ifp = rte->rt_ifp; - /* rt_ifa holds the route answer source address */ - ifa = rte->rt_ifa; - } - RTFREE_LOCKED(rte); - } else if (ifp && ifp != rte->rt_ifp) { - RTFREE_LOCKED(rte); - error = ENETUNREACH; - goto done; - } else { - if (ifp == NULL) { - ifp = rte->rt_ifp; - ifa = rte->rt_ifa; - } - RT_UNLOCK(rte); - } -#if defined(INET) || defined(INET6) -mcast: -#endif - if (bcast) { - memcpy(edst, ifp->if_broadcastaddr, ifp->if_addrlen); - goto done; - } else if (multi) { - struct sockaddr *llsa; - struct sockaddr_dl sdl; - - sdl.sdl_len = sizeof(sdl); - llsa = (struct sockaddr *)&sdl; - - if (ifp->if_resolvemulti == NULL) { - error = EOPNOTSUPP; - goto done; - } - error = ifp->if_resolvemulti(ifp, &llsa, dst_in); - if (error == 0) { - memcpy(edst, LLADDR((struct sockaddr_dl *)llsa), - ifp->if_addrlen); - } - goto done; - } - /* - * Resolve the link local address. - */ - switch (dst_in->sa_family) { -#ifdef INET - case AF_INET: - error = arpresolve(ifp, is_gw, NULL, - is_gw ? rte->rt_gateway : dst_in, edst, NULL, NULL); - break; -#endif -#ifdef INET6 - case AF_INET6: - error = nd6_resolve(ifp, is_gw, NULL, - is_gw ? 
rte->rt_gateway : dst_in, edst, NULL, NULL); - break; -#endif - default: - KASSERT(0, ("rdma_addr_resolve: Unreachable")); - error = EINVAL; - break; - } - RTFREE(rte); -done: - if (error == 0) - error = -rdma_copy_addr(addr, ifp, edst); - if (error == 0) - memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); -#ifdef INET6 - if (scope_id < 256) { - sin6 = (struct sockaddr_in6 *)src_in; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - sin6 = (struct sockaddr_in6 *)dst_in; - SCOPE_ID_RESTORE(scope_id, sin6); - } -#endif - if (error == EWOULDBLOCK) - error = ENODATA; - - CURVNET_RESTORE(); - return -error; -} - -static void process_req(struct work_struct *work) -{ - struct addr_req *req, *temp_req; - struct sockaddr *src_in, *dst_in; - struct list_head done_list; - - INIT_LIST_HEAD(&done_list); - - mutex_lock(&lock); - list_for_each_entry_safe(req, temp_req, &req_list, list) { - if (req->status == -ENODATA) { - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); - if (req->status && time_after_eq(jiffies, req->timeout)) - req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) - continue; - } - list_move_tail(&req->list, &done_list); - } - - if (!list_empty(&req_list)) { - req = list_entry(req_list.next, struct addr_req, list); - set_timeout(req->timeout); - } - mutex_unlock(&lock); - - list_for_each_entry_safe(req, temp_req, &done_list, list) { - list_del(&req->list); - req->callback(req->status, (struct sockaddr *) &req->src_addr, - req->addr, req->context); - put_client(req->client); - kfree(req); - } -} - -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, - void (*callback)(int status, struct sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context), - void *context) -{ - struct sockaddr *src_in, *dst_in; - struct addr_req *req; - int ret = 0; - - req = kzalloc(sizeof *req, GFP_KERNEL); - if (!req) - return -ENOMEM; - - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) { - ret = -EINVAL; - goto err; - } - - memcpy(src_in, src_addr, ip_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } - - memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); - req->addr = addr; - req->callback = callback; - req->context = context; - req->client = client; - atomic_inc(&client->refcount); - - req->status = addr_resolve(src_in, dst_in, addr); - switch (req->status) { - case 0: - req->timeout = jiffies; - queue_req(req); - break; - case -ENODATA: - req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; - queue_req(req); - break; - default: - ret = req->status; - atomic_dec(&client->refcount); - goto err; - } - return ret; -err: - kfree(req); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_ip); - -void rdma_addr_cancel(struct rdma_dev_addr *addr) -{ - struct addr_req *req, *temp_req; - - mutex_lock(&lock); - list_for_each_entry_safe(req, temp_req, &req_list, list) { - if (req->addr == addr) { - req->status = -ECANCELED; - req->timeout = jiffies; - list_move(&req->list, &req_list); - set_timeout(req->timeout); - break; - } - } - mutex_unlock(&lock); -} -EXPORT_SYMBOL(rdma_addr_cancel); - -struct resolve_cb_context { - struct rdma_dev_addr *addr; - struct completion comp; -}; - -static void resolve_cb(int status, struct 
sockaddr *src_addr, - struct rdma_dev_addr *addr, void *context) -{ - memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct - rdma_dev_addr)); - complete(&((struct resolve_cb_context *)context)->comp); -} - -int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, - u16 *vlan_id, u32 scope_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - struct resolve_cb_context ctx; - struct net_device *dev; - - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - - - ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid, scope_id); - if (ret) - return ret; - - ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid, scope_id); - if (ret) - return ret; - - memset(&dev_addr, 0, sizeof(dev_addr)); - - ctx.addr = &dev_addr; - init_completion(&ctx.comp); - ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); - if (ret) - return ret; - - wait_for_completion(&ctx.comp); - - memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); - dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); - if (!dev) - return -ENODEV; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); - -u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num) -{ -#ifdef INET6 - struct ifnet *ifp; - if (ib->get_netdev == NULL) - return (-1U); - ifp = ib->get_netdev(ib, port_num); - if (ifp == NULL) - return (-1U); - return (in6_getscopezone(ifp, IPV6_ADDR_SCOPE_LINKLOCAL)); -#else - return (-1U); -#endif -} - -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id, - u32 scope_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } gid_addr; - - ret = rdma_gid2ip(&gid_addr._sockaddr, sgid, scope_id); - if (ret) - return ret; - memset(&dev_addr, 0, sizeof(dev_addr)); - ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); - if (ret) - return ret; - - memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); - -static int netevent_callback(struct notifier_block *self, unsigned long event, - void *ctx) -{ - if (event == NETEVENT_NEIGH_UPDATE) { - set_timeout(jiffies); - } - return 0; -} - -static struct notifier_block nb = { - .notifier_call = netevent_callback -}; - -static int __init addr_init(void) -{ - INIT_DELAYED_WORK(&work, process_req); - addr_wq = create_singlethread_workqueue("ib_addr"); - if (!addr_wq) - return -ENOMEM; - - register_netevent_notifier(&nb); - rdma_addr_register_client(&self); - return 0; -} - -static void __exit addr_cleanup(void) -{ - rdma_addr_unregister_client(&self); - unregister_netevent_notifier(&nb); - destroy_workqueue(addr_wq); -} - -module_init(addr_init); -module_exit(addr_cleanup); Index: sys/ofed/drivers/infiniband/core/agent.h =================================================================== --- sys/ofed/drivers/infiniband/core/agent.h +++ sys/ofed/drivers/infiniband/core/agent.h @@ -44,8 +44,8 @@ extern int ib_agent_port_close(struct ib_device *device, int port_num); -extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn); +extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int 
port_num, int qpn, size_t resp_mad_len, bool opa); #endif /* __AGENT_H_ */ Index: sys/ofed/drivers/infiniband/core/cache.c =================================================================== --- sys/ofed/drivers/infiniband/core/cache.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005 Voltaire, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#include - -#include "core_priv.h" - -struct ib_pkey_cache { - int table_len; - u16 table[0]; -}; - -struct ib_gid_cache { - int table_len; - union ib_gid table[0]; -}; - -struct ib_update_work { - struct work_struct work; - struct ib_device *device; - u8 port_num; -}; - -static inline int start_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; -} - -static inline int end_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 
- 0 : device->phys_port_cnt; -} - -int ib_get_cached_gid(struct ib_device *device, - u8 port_num, - int index, - union ib_gid *gid) -{ - struct ib_gid_cache *cache; - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - - if (device->cache.gid_cache) { - cache = device->cache.gid_cache[port_num - start_port(device)]; - - if (cache && index >= 0 && index < cache->table_len) { - *gid = cache->table[index]; - ret = 0; - } - } - - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_gid); - -int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, - u8 *port_num, - u16 *index) -{ - struct ib_gid_cache *cache; - unsigned long flags; - int p, i; - int ret = -ENOENT; - - *port_num = -1; - if (index) - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - if (!device->cache.gid_cache) - goto out; - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - cache = device->cache.gid_cache[p]; - if (!cache) - continue; - for (i = 0; i < cache->table_len; ++i) { - if (!memcmp(gid, &cache->table[i], sizeof *gid)) { - *port_num = p + start_port(device); - if (index) - *index = i; - ret = 0; - goto out; - } - } - } -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_cached_gid); - -int ib_get_cached_pkey(struct ib_device *device, - u8 port_num, - int index, - u16 *pkey) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - - if (device->cache.pkey_cache) { - cache = device->cache.pkey_cache[port_num - start_port(device)]; - - if (cache && index >= 0 && index < cache->table_len) { - *pkey = cache->table[index]; - ret = 0; - } - } - - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_pkey); - -int ib_find_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - int partial_ix = -1; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - - if (!device->cache.pkey_cache) - goto out; - - cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (!cache) - goto out; - - for (i = 0; i < cache->table_len; ++i) - if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { - if (cache->table[i] & 0x8000) { - *index = i; - ret = 0; - break; - } else - partial_ix = i; - } - - if (ret && partial_ix >= 0) { - *index = partial_ix; - ret = 0; - } -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_cached_pkey); - -int ib_find_exact_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - *index = -1; - - read_lock_irqsave(&device->cache.lock, flags); - - if (!device->cache.pkey_cache) - goto out; - - cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (!cache) - goto out; - - for (i = 0; i < cache->table_len; ++i) - if (cache->table[i] == pkey) { - *index = i; - ret = 0; - break; - 
} -out: - read_unlock_irqrestore(&device->cache.lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_find_exact_cached_pkey); - -int ib_get_cached_lmc(struct ib_device *device, - u8 port_num, - u8 *lmc) -{ - unsigned long flags; - int ret = -EINVAL; - - if (port_num < start_port(device) || port_num > end_port(device)) - return -EINVAL; - - read_lock_irqsave(&device->cache.lock, flags); - if (device->cache.lmc_cache) { - *lmc = device->cache.lmc_cache[port_num - start_port(device)]; - ret = 0; - } - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_cached_lmc); - -static void ib_cache_update(struct ib_device *device, - u8 port) -{ - struct ib_port_attr *tprops = NULL; - struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; - struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; - int i; - int ret; - - if (!(device->cache.pkey_cache && device->cache.gid_cache && - device->cache.lmc_cache)) - return; - - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - return; - - ret = ib_query_port(device, port, tprops); - if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s\n", - ret, device->name); - goto err; - } - - pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * - sizeof *pkey_cache->table, GFP_KERNEL); - if (!pkey_cache) - goto err; - - pkey_cache->table_len = tprops->pkey_tbl_len; - - gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * - sizeof *gid_cache->table, GFP_KERNEL); - if (!gid_cache) - goto err; - - gid_cache->table_len = tprops->gid_tbl_len; - - for (i = 0; i < pkey_cache->table_len; ++i) { - ret = ib_query_pkey(device, port, i, pkey_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; - } - } - - for (i = 0; i < gid_cache->table_len; ++i) { - ret = ib_query_gid(device, port, i, gid_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); - goto err; - } - } - - write_lock_irq(&device->cache.lock); - - old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; - old_gid_cache = device->cache.gid_cache [port - start_port(device)]; - - device->cache.pkey_cache[port - start_port(device)] = pkey_cache; - device->cache.gid_cache [port - start_port(device)] = gid_cache; - - device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; - - write_unlock_irq(&device->cache.lock); - - kfree(old_pkey_cache); - kfree(old_gid_cache); - kfree(tprops); - return; - -err: - kfree(pkey_cache); - kfree(gid_cache); - kfree(tprops); -} - -static void ib_cache_task(struct work_struct *_work) -{ - struct ib_update_work *work = - container_of(_work, struct ib_update_work, work); - - ib_cache_update(work->device, work->port_num); - kfree(work); -} - -static void ib_cache_event(struct ib_event_handler *handler, - struct ib_event *event) -{ - struct ib_update_work *work; - - if (event->event == IB_EVENT_PORT_ERR || - event->event == IB_EVENT_PORT_ACTIVE || - event->event == IB_EVENT_LID_CHANGE || - event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER || - event->event == IB_EVENT_GID_CHANGE) { - work = kmalloc(sizeof *work, GFP_ATOMIC); - if (work) { - INIT_WORK(&work->work, ib_cache_task); - work->device = event->device; - work->port_num = event->element.port_num; - queue_work(ib_wq, &work->work); - } - } -} - -static void ib_cache_setup_one(struct ib_device *device) -{ - int p; 
- - rwlock_init(&device->cache.lock); - - device->cache.pkey_cache = - kmalloc(sizeof *device->cache.pkey_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - device->cache.gid_cache = - kmalloc(sizeof *device->cache.gid_cache * - (end_port(device) - start_port(device) + 1), GFP_KERNEL); - - device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * - (end_port(device) - - start_port(device) + 1), - GFP_KERNEL); - - if (!device->cache.pkey_cache || !device->cache.gid_cache || - !device->cache.lmc_cache) { - printk(KERN_WARNING "Couldn't allocate cache " - "for %s\n", device->name); - goto err; - } - - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - device->cache.pkey_cache[p] = NULL; - device->cache.gid_cache [p] = NULL; - ib_cache_update(device, p + start_port(device)); - } - - INIT_IB_EVENT_HANDLER(&device->cache.event_handler, - device, ib_cache_event); - if (ib_register_event_handler(&device->cache.event_handler)) - goto err_cache; - - return; - -err_cache: - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } - -err: - kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); - kfree(device->cache.lmc_cache); - device->cache.pkey_cache = NULL; - device->cache.gid_cache = NULL; - device->cache.lmc_cache = NULL; -} - -static void ib_cache_cleanup_one(struct ib_device *device) -{ - int p; - - if (!(device->cache.pkey_cache && device->cache.gid_cache && - device->cache.lmc_cache)) - return; - - ib_unregister_event_handler(&device->cache.event_handler); - flush_workqueue(ib_wq); - - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - kfree(device->cache.pkey_cache[p]); - kfree(device->cache.gid_cache[p]); - } - - kfree(device->cache.pkey_cache); - kfree(device->cache.gid_cache); - kfree(device->cache.lmc_cache); -} - -static struct ib_client cache_client = { - .name = "cache", - .add = ib_cache_setup_one, - .remove = ib_cache_cleanup_one -}; - -int __init ib_cache_setup(void) -{ - return ib_register_client(&cache_client); -} - -void __exit ib_cache_cleanup(void) -{ - ib_unregister_client(&cache_client); -} Index: sys/ofed/drivers/infiniband/core/cm_msgs.h =================================================================== --- sys/ofed/drivers/infiniband/core/cm_msgs.h +++ sys/ofed/drivers/infiniband/core/cm_msgs.h @@ -103,7 +103,7 @@ /* local ACK timeout:5, rsvd:3 */ u8 alt_offset139; - u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE]; + u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; } __attribute__ ((packed)); @@ -801,7 +801,7 @@ __be16 rsvd; __be64 service_id; - u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE]; + u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; } __attribute__ ((packed)); struct cm_sidr_rep_msg { Index: sys/ofed/drivers/infiniband/core/core_priv.h =================================================================== --- sys/ofed/drivers/infiniband/core/core_priv.h +++ sys/ofed/drivers/infiniband/core/core_priv.h @@ -38,15 +38,111 @@ #include +#include + +#ifdef CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device 
*cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, - u8, struct kobject *)); + u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); -int ib_sysfs_setup(void); -void ib_sysfs_cleanup(void); - -int ib_cache_setup(void); +void ib_cache_setup(void); void ib_cache_cleanup(void); +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); + +enum ib_cache_gid_default_mode { + IB_CACHE_GID_DEFAULT_MODE_SET, + IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); + +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + + /* TODO: add support for LAGG */ + upper = VLAN_TRUNKDEV(upper); + + return (dev == upper); +} + +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + #endif /* _CORE_PRIV_H */ Index: sys/ofed/drivers/infiniband/core/ib_addr.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_addr.c @@ -0,0 +1,751 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "core_priv.h" + +struct addr_req { + struct list_head list; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr *addr; + struct rdma_addr_client *client; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + int status; +}; + +static void process_req(struct work_struct *work); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_DELAYED_WORK(work, process_req); +static struct workqueue_struct *addr_wq; + +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +static struct rdma_addr_client self; + +void rdma_addr_register_client(struct rdma_addr_client *client) +{ + atomic_set(&client->refcount, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_register_client); + +static inline void put_client(struct rdma_addr_client *client) +{ + if (atomic_dec_and_test(&client->refcount)) + complete(&client->comp); +} + +void rdma_addr_unregister_client(struct rdma_addr_client *client) +{ + put_client(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_unregister_client); + +static inline void +rdma_copy_addr_sub(u8 *dst, const u8 *src, unsigned min, unsigned max) +{ + if (min > max) + min = max; + memcpy(dst, src, min); + memset(dst + min, 0, max - min); +} + +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + const unsigned char *dst_dev_addr) +{ + if (dev->if_type == IFT_INFINIBAND) + dev_addr->dev_type = ARPHRD_INFINIBAND; + else if (dev->if_type == IFT_ETHER) + dev_addr->dev_type = ARPHRD_ETHER; + else + dev_addr->dev_type = 0; + rdma_copy_addr_sub(dev_addr->src_dev_addr, IF_LLADDR(dev), + dev->if_addrlen, MAX_ADDR_LEN); + rdma_copy_addr_sub(dev_addr->broadcast, dev->if_broadcastaddr, + dev->if_addrlen, MAX_ADDR_LEN); + if (dst_dev_addr != NULL) { + 
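/* The resolved destination link-layer address (e.g. the result of ARP or ND) is copied here and zero-padded to MAX_ADDR_LEN by rdma_copy_addr_sub(). */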
rdma_copy_addr_sub(dev_addr->dst_dev_addr, dst_dev_addr, + dev->if_addrlen, MAX_ADDR_LEN); + } + dev_addr->bound_dev_if = dev->if_index; + return 0; +} +EXPORT_SYMBOL(rdma_copy_addr); + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, + u16 *vlan_id) +{ + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; + } + + switch (addr->sa_family) { +#ifdef INET + case AF_INET: + dev = ip_dev_find(dev_addr->net, + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: { + struct in6_addr in6_addr = ((const struct sockaddr_in6 *)addr)->sin6_addr; + + /* embed scope ID */ + in6_addr.s6_addr[3] = ((const struct sockaddr_in6 *)addr)->sin6_scope_id; + + dev = ip6_dev_find(dev_addr->net, in6_addr); + break; + } +#endif + default: + break; + } + + if (dev != NULL) { + ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); + } + return ret; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(unsigned long time) +{ + int delay; /* under FreeBSD ticks are 32-bit */ + + delay = time - jiffies; + if (delay <= 0) + delay = 1; + + mod_delayed_work(addr_wq, &work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after_eq(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, &temp_req->list); + + if (req_list.next == &req->list) + set_timeout(req->timeout); + mutex_unlock(&lock); +} + +#if defined(INET) || defined(INET6) +static int addr_resolve_multi(u8 *edst, struct ifnet *ifp, struct sockaddr *dst_in) +{ + struct sockaddr *llsa; + struct sockaddr_dl sdl; + int error; + + sdl.sdl_len = sizeof(sdl); + llsa = (struct sockaddr *)&sdl; + + if (ifp->if_resolvemulti == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_resolvemulti(ifp, &llsa, dst_in); + if (error == 0) { + rdma_copy_addr_sub(edst, LLADDR((struct sockaddr_dl *)llsa), + ifp->if_addrlen, MAX_ADDR_LEN); + } + } + return (error); +} +#endif + +#ifdef INET +static int addr4_resolve(struct sockaddr_in *src_in, + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + struct sockaddr_in dst_tmp = *dst_in; + u8 edst[MAX_ADDR_LEN]; + struct rtentry *rte; + struct ifnet *ifp; + int error; + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin_len = sizeof(dst_tmp); + + CURVNET_SET(addr->net); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + + /* + * Make sure the route exists and has a valid link. 
+ */ + if (rte == NULL) { + error = EHOSTUNREACH; + goto done; + } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { + RTFREE_LOCKED(rte); + error = EHOSTUNREACH; + goto done; + } else if (src_in->sin_addr.s_addr != INADDR_ANY) { + RT_UNLOCK(rte); + + ifp = ip_dev_find(addr->net, src_in->sin_addr.s_addr); + if (ifp == NULL) { + RTFREE(rte); + error = ENETUNREACH; + goto done; + } else if (ifp != rte->rt_ifp) { + error = ENETUNREACH; + goto failure; + } + } else { + struct sockaddr *saddr; + + ifp = rte->rt_ifp; + dev_hold(ifp); + + saddr = rte->rt_ifa->ifa_addr; + memcpy(src_in, saddr, rdma_addr_size(saddr)); + RT_UNLOCK(rte); + } + + /* + * Resolve destination MAC address + */ + if (dst_tmp.sin_addr.s_addr == INADDR_BROADCAST) { + rdma_copy_addr_sub(edst, ifp->if_broadcastaddr, + ifp->if_addrlen, MAX_ADDR_LEN); + } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) { + error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); + if (error != 0) + goto failure; + } else { + bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + memset(edst, 0, sizeof(edst)); + error = arpresolve(ifp, is_gw, NULL, is_gw ? + rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + edst, NULL, NULL); + if (error != 0) + goto failure; + else if (is_gw != 0) + addr->network = RDMA_NETWORK_IPV4; + } + + /* + * Copy destination and source MAC addresses + */ + error = -rdma_copy_addr(addr, ifp, edst); + if (error != 0) { +failure: + dev_put(ifp); + + if (error == EWOULDBLOCK || error == EAGAIN) + error = ENODATA; + } else { + *ifpp = ifp; + } + RTFREE(rte); +done: + return (-error); +} +#else +static int addr4_resolve(struct sockaddr_in *src_in, + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + return -EADDRNOTAVAIL; +} +#endif + +#ifdef INET6 +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + struct sockaddr_in6 dst_tmp = *dst_in; + u8 edst[MAX_ADDR_LEN]; + struct rtentry *rte; + struct ifnet *ifp; + int error; + + sa6_embedscope(&dst_tmp, 0); + sa6_embedscope(src_in, 0); + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin6_len = sizeof(dst_tmp); + + CURVNET_SET(addr->net); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + + /* + * Make sure the route exists and has a valid link. + */ + if (rte == NULL) { + error = EHOSTUNREACH; + goto done; + } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) { + RTFREE_LOCKED(rte); + error = EHOSTUNREACH; + goto done; + } else if (!IN6_IS_ADDR_UNSPECIFIED(&src_in->sin6_addr)) { + RT_UNLOCK(rte); + + ifp = ip6_dev_find(addr->net, src_in->sin6_addr); + if (ifp == NULL) { + RTFREE(rte); + error = ENETUNREACH; + goto done; + } else if (ifp != rte->rt_ifp) { + error = ENETUNREACH; + goto failure; + } + } else { + struct sockaddr *saddr; + + ifp = rte->rt_ifp; + dev_hold(ifp); + + saddr = rte->rt_ifa->ifa_addr; + memcpy(src_in, saddr, rdma_addr_size(saddr)); + RT_UNLOCK(rte); + } + + /* + * Resolve destination MAC address + */ + if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) { + error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp); + if (error != 0) + goto failure; + } else { + bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0; + memset(edst, 0, sizeof(edst)); + error = nd6_resolve(ifp, is_gw, NULL, is_gw ? 
+ rte->rt_gateway : (const struct sockaddr *)&dst_tmp, + edst, NULL, NULL); + if (error != 0) + goto failure; + else if (is_gw != 0) + addr->network = RDMA_NETWORK_IPV6; + } + + /* + * Copy destination and source MAC addresses + */ + error = -rdma_copy_addr(addr, ifp, edst); + if (error != 0) { +failure: + dev_put(ifp); + + if (error == EWOULDBLOCK || error == EAGAIN) + error = ENODATA; + } else { + *ifpp = ifp; + } + RTFREE(rte); +done: + sa6_recoverscope(&dst_tmp); + sa6_recoverscope(src_in); + + return (-error); +} +#else +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct ifnet **ifpp) +{ + return -EADDRNOTAVAIL; +} +#endif + +static int addr_resolve_neigh(struct ifnet *dev, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dev->if_flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dev->if_flags & IFF_NOARP)) + return 0; + + return rdma_copy_addr(addr, dev, NULL); +} + +static int addr_resolve(struct sockaddr *src_in, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) +{ + struct net_device *ndev = NULL; + int ret; + + if (dst_in->sa_family != src_in->sa_family) + return -EINVAL; + + if (src_in->sa_family == AF_INET) { + ret = addr4_resolve((struct sockaddr_in *)src_in, + (const struct sockaddr_in *)dst_in, + addr, &ndev); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(ndev, dst_in, addr); + } else { + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + (const struct sockaddr_in6 *)dst_in, addr, + &ndev); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(ndev, dst_in, addr); + } + + addr->bound_dev_if = ndev->if_index; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; +} + +static void process_req(struct work_struct *work) +{ + struct addr_req *req, *temp_req; + struct sockaddr *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status == -ENODATA) { + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true); + if (req->status && time_after_eq(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) + continue; + } + list_move_tail(&req->list, &done_list); + } + + if (!list_empty(&req_list)) { + req = list_entry(req_list.next, struct addr_req, list); + set_timeout(req->timeout); + } + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + req->callback(req->status, (struct sockaddr *) &req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); + } +} + +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if 
(src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->client = client; + atomic_inc(&client->refcount); + + req->status = addr_resolve(src_in, dst_in, addr, true); + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + break; + default: + ret = req->status; + atomic_dec(&client->refcount); + goto err; + } + return ret; +err: + kfree(req); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = jiffies; + list_move(&req->list, &req_list); + set_timeout(req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +struct resolve_cb_context { + struct rdma_dev_addr *addr; + struct completion comp; + int status; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + if (!status) + memcpy(((struct resolve_cb_context *)context)->addr, + addr, sizeof(struct rdma_dev_addr)); + ((struct resolve_cb_context *)context)->status = status; + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) +{ + int ret = 0; + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + struct net_device *dev; + + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + rdma_gid2ip(&dgid_addr._sockaddr, dgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + if (if_index) + dev_addr.bound_dev_if = *if_index; + dev_addr.net = TD_TO_VNET(curthread); + + ctx.addr = &dev_addr; + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + ret = ctx.status; + if (ret) + return ret; + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + dev = dev_get_by_index(dev_addr.net, dev_addr.bound_dev_if); + if (!dev) + return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) +{ + int 
ret = 0; + struct rdma_dev_addr dev_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } gid_addr; + + rdma_gid2ip(&gid_addr._sockaddr, sgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.net = TD_TO_VNET(curthread); + ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); + if (ret) + return ret; + + memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); + return ret; +} +EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); + +int addr_init(void) +{ + addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); + if (!addr_wq) + return -ENOMEM; + + rdma_addr_register_client(&self); + + return 0; +} + +void addr_cleanup(void) +{ + rdma_addr_unregister_client(&self); + destroy_workqueue(addr_wq); +} Index: sys/ofed/drivers/infiniband/core/ib_agent.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_agent.c +++ sys/ofed/drivers/infiniband/core/ib_agent.c @@ -54,7 +54,7 @@ static LIST_HEAD(ib_agent_port_list); static struct ib_agent_port_private * -__ib_get_agent_port(struct ib_device *device, int port_num) +__ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; @@ -67,7 +67,7 @@ } static struct ib_agent_port_private * -ib_get_agent_port(struct ib_device *device, int port_num) +ib_get_agent_port(const struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; unsigned long flags; @@ -78,9 +78,9 @@ return entry; } -void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn) +void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; @@ -88,44 +88,49 @@ struct ib_ah *ah; struct ib_mad_send_wr_private *mad_send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(device)) port_priv = ib_get_agent_port(device, 0); else port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { - printk(KERN_ERR SPFX "Unable to find port agent\n"); + dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", PTR_ERR(ah)); return; } + if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) + resp_mad_len = IB_MGMT_MAD_SIZE; + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, - IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_KERNEL); + IB_MGMT_MAD_HDR, + resp_mad_len - IB_MGMT_MAD_HDR, + GFP_KERNEL, + mad_hdr->base_version); if (IS_ERR(send_buf)) { - printk(KERN_ERR SPFX "ib_create_send_mad error\n"); + dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } - memcpy(send_buf->mad, mad, sizeof *mad); + memcpy(send_buf->mad, mad_hdr, resp_mad_len); send_buf->ah = ah; - if (device->node_type == RDMA_NODE_IB_SWITCH) { + if (rdma_cap_ib_switch(device)) { mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); - mad_send_wr->send_wr.wr.ud.port_num = port_num; + mad_send_wr->send_wr.port_num = port_num; } if (ib_post_send_mad(send_buf, NULL)) { - printk(KERN_ERR SPFX "ib_post_send_mad error\n"); + dev_err(&device->dev, "ib_post_send_mad 
error\n"); goto err2; } return; @@ -151,17 +156,17 @@ /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n"); + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); ret = -ENOMEM; goto error1; } - if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_smi(device, port_num)) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; @@ -172,7 +177,7 @@ port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; @@ -202,7 +207,7 @@ port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - printk(KERN_ERR SPFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); Index: sys/ofed/drivers/infiniband/core/ib_cache.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_cache.c @@ -0,0 +1,1253 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[0]; +}; + +struct ib_update_work { + struct work_struct work; + struct ib_device *device; + u8 port_num; +}; + +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +static const struct ib_gid_attr zattr; + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, + GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, +}; + +enum gid_table_entry_props { + GID_TABLE_ENTRY_INVALID = 1UL << 0, + GID_TABLE_ENTRY_DEFAULT = 1UL << 1, +}; + +enum gid_table_write_action { + GID_TABLE_WRITE_ACTION_ADD, + GID_TABLE_WRITE_ACTION_DEL, + /* MODIFY only updates the GID table. Currently only used by + * ib_cache_update. + */ + GID_TABLE_WRITE_ACTION_MODIFY +}; + +struct ib_gid_table_entry { + unsigned long props; + union ib_gid gid; + struct ib_gid_attr attr; + void *context; +}; + +struct ib_gid_table { + int sz; + /* In RoCE, adding a GID to the table requires: + * (a) Find if this GID is already exists. + * (b) Find a free space. + * (c) Write the new GID + * + * Delete requires different set of operations: + * (a) Find the GID + * (b) Delete it. + * + * Add/delete should be carried out atomically. + * This is done by locking this mutex from multiple + * writers. We don't need this lock for IB, as the MAD + * layer replaces all entries. All data_vec entries + * are locked by this lock. + **/ + struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously. + */ + rwlock_t rwlock; + struct ib_gid_table_entry *data_vec; +}; + +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ +static int write_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + enum gid_table_write_action action, + bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) +{ + int ret = 0; + struct net_device *old_net_dev; + enum ib_gid_type old_gid_type; + + /* in rdma_cap_roce_gid_table, this funciton should be protected by a + * sleep-able lock. 
+ */ + + if (rdma_cap_roce_gid_table(ib_dev, port)) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by + * RoCE providers and thus only updates the cache. + */ + if (action == GID_TABLE_WRITE_ACTION_ADD) + ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, + &table->data_vec[ix].context); + else if (action == GID_TABLE_WRITE_ACTION_DEL) + ret = ib_dev->del_gid(ib_dev, port, ix, + &table->data_vec[ix].context); + write_lock_irq(&table->rwlock); + } + + old_net_dev = table->data_vec[ix].attr.ndev; + old_gid_type = table->data_vec[ix].attr.gid_type; + if (old_net_dev && old_net_dev != attr->ndev) + dev_put(old_net_dev); + /* if modify_gid failed, just delete the old gid */ + if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { + gid = &zgid; + attr = &zattr; + table->data_vec[ix].context = NULL; + } + + memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); + memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); + if (default_gid) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + if (action == GID_TABLE_WRITE_ACTION_DEL) + table->data_vec[ix].attr.gid_type = old_gid_type; + } + if (table->data_vec[ix].attr.ndev && + table->data_vec[ix].attr.ndev != old_net_dev) + dev_hold(table->data_vec[ix].attr.ndev); + + table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; + + return ret; +} + +static int add_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_ADD, default_gid); +} + +static int modify_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, gid, attr, + GID_TABLE_WRITE_ACTION_MODIFY, default_gid); +} + +static int del_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix, + bool default_gid) { + return write_gid(ib_dev, port, table, ix, &zgid, &zattr, + GID_TABLE_WRITE_ACTION_DEL, default_gid); +} + +/* rwlock should be read locked */ +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, + const struct ib_gid_attr *val, bool default_gid, + unsigned long mask, int *pempty) +{ + int i = 0; + int found = -1; + int empty = pempty ? 
-1 : 0; + + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; + + i++; + + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID && + memcmp(gid, &data->gid, sizeof(*gid))) + continue; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + continue; + + if (mask & GID_ATTR_FIND_MASK_DEFAULT && + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != + default_gid) + continue; + + found = curr_index; + } + + if (pempty) + *pempty = empty; + + return found; +} + +static void addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->if_addrlen != ETH_ALEN) + return; + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); + + /* NOTE: The scope ID is added by the GID to IP conversion */ + + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; +} + +static void make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + int ret = 0; + struct net_device *idev; + int empty; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -EINVAL; + + if (ib_dev->get_netdev) { + idev = ib_dev->get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; + + /* Adding default GIDs in not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { + dev_put(idev); + return -EPERM; + } + } + if (idev) + dev_put(idev); + } + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV, &empty); + if (ix >= 0) + goto out_unlock; + + if (empty < 0) { + ret = -ENOSPC; + goto out_unlock; + } + + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + return ret; +} + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + int ix; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + ix = find_gid(table, gid, attr, false, + GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + if (ix < 0) + goto out_unlock; + + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + return 0; +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + 
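/*
 * ib_cache_gid_del_all_netdev_gids() drops every GID that was derived from
 * @ndev, typically when the interface goes away; entries flagged as default
 * keep their GID type and default property in write_gid() so the slot can be
 * repopulated later.  A minimal consumer-side sketch of the cached lookup
 * path, assuming a valid "ib_dev", "port" and "index" (illustrative only;
 * the lookup takes a net_device reference that the caller must drop):
 *
 *	union ib_gid gid;
 *	struct ib_gid_attr attr;
 *
 *	if (ib_get_cached_gid(ib_dev, port, index, &gid, &attr) == 0 &&
 *	    attr.ndev != NULL)
 *		dev_put(attr.ndev);
 */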
struct ib_gid_table *table; + int ix; + bool deleted = false; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + + for (ix = 0; ix < table->sz; ix++) + if (table->data_vec[ix].attr.ndev == ndev) + if (!del_gid(ib_dev, port, table, ix, + !!(table->data_vec[ix].props & + GID_TABLE_ENTRY_DEFAULT))) + deleted = true; + + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + + return 0; +} + +static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (index < 0 || index >= table->sz) + return -EINVAL; + + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) + return -EAGAIN; + + memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); + if (attr) { + memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); + if (attr->ndev) + dev_hold(attr->ndev); + } + + return 0; +} + +static int _ib_cache_gid_table_find(struct ib_device *ib_dev, + const union ib_gid *gid, + const struct ib_gid_attr *val, + unsigned long mask, + u8 *port, u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + u8 p; + int local_index; + unsigned long flags; + + for (p = 0; p < ib_dev->phys_port_cnt; p++) { + table = ports_table[p]; + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + if (port) + *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return -ENOENT; +} + +static int ib_cache_gid_find(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, u8 *port, + u16 *index) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, + mask, port, index); +} + +int ib_find_cached_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port, struct net_device *ndev, + u16 *index) +{ + int local_index; + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev)) + return -ENOENT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + + read_unlock_irqrestore(&table->rwlock, flags); + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_cached_gid_by_port); + +/** + * ib_find_gid_by_filter - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. 
+ * @port_num: The port number of the device where the GID value could be + * searched. + * @filter: The filter function is executed on any matching GID in the table. + * If the filter function returns true, the corresponding index is returned, + * otherwise, we continue searching the GID table. It's guaranteed that + * while filter is executed, ndev field is valid and the structure won't + * change. filter is executed in an atomic context. filter must not be NULL. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_cache_gid_find_by_filter() searches for the specified GID value + * of which the filter function returns true in the port's GID table. + * This function is only supported on RoCE ports. + * + */ +static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, + const union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; + unsigned int i; + unsigned long flags; + bool found = false; + + if (!ports_table) + return -EOPNOTSUPP; + + if (port < rdma_start_port(ib_dev) || + port > rdma_end_port(ib_dev) || + !rdma_protocol_roce(ib_dev, port)) + return -EPROTONOSUPPORT; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + struct ib_gid_attr attr; + + if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + goto next; + + if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + goto next; + + memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); + + if (filter(gid, &attr, context)) + found = true; + +next: + if (found) + break; + } + read_unlock_irqrestore(&table->rwlock, flags); + + if (!found) + return -ENOENT; + + if (index) + *index = i; + return 0; +} + +static struct ib_gid_table *alloc_gid_table(int sz) +{ + struct ib_gid_table *table = + kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + + if (!table) + return NULL; + + table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); + if (!table->data_vec) + goto err_free_table; + + mutex_init(&table->lock); + + table->sz = sz; + rwlock_init(&table->rwlock); + + return table; + +err_free_table: + kfree(table); + return NULL; +} + +static void release_gid_table(struct ib_gid_table *table) +{ + if (table) { + kfree(table->data_vec); + kfree(table); + } +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + int i; + bool deleted = false; + + if (!table) + return; + + write_lock_irq(&table->rwlock); + for (i = 0; i < table->sz; ++i) { + if (memcmp(&table->data_vec[i].gid, &zgid, + sizeof(table->data_vec[i].gid))) + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; + } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode) +{ + struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; + union ib_gid gid; + struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; + struct ib_gid_table *table; + unsigned int gid_type; + + table = ports_table[port - rdma_start_port(ib_dev)]; + + make_default_gid(ndev, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + 
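/*
 * The loop below visits each RoCE GID type enabled in @gid_type_mask, finds
 * the table slot reserved for that type's default GID and either installs or
 * removes the default GID built above from the net_device link-layer address
 * (fe80::/64 prefix plus an interface id derived from the MAC, see
 * make_default_gid() and addrconf_ifid_eui48()).  A caller-side sketch,
 * assuming a driver-supplied "ib_dev", "port" and "ndev" (illustrative only):
 *
 *	unsigned long mask = roce_gid_type_mask_support(ib_dev, port);
 *
 *	ib_cache_gid_set_default_gid(ib_dev, port, ndev, mask,
 *	    IB_CACHE_GID_DEFAULT_MODE_SET);
 */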
gid_attr.ndev = ndev; + + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Coudn't find default GID location */ + if (WARN_ON(ix < 0)) + goto release; + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + &current_gid, &current_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, &current_gid, sizeof(gid)) && + !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(&current_gid, &zgid, sizeof(current_gid)) || + memcmp(&current_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } + } + + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); + } + +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } +} + +static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; + + entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; + } + + return 0; +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ + u8 port; + struct ib_gid_table **table; + int err = 0; + + table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); + + if (!table) { + pr_warn("failed to allocate ib gid cache for %s\n", + ib_dev->name); + return -ENOMEM; + } + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + u8 rdma_port = port + rdma_start_port(ib_dev); + + table[port] = + alloc_gid_table( + ib_dev->port_immutable[rdma_port].gid_tbl_len); + if (!table[port]) { + err = -ENOMEM; + goto rollback_table_setup; + } + + err = gid_table_reserve_default(ib_dev, + port + rdma_start_port(ib_dev), + table[port]); + if (err) + goto rollback_table_setup; + } + + ib_dev->cache.gid_cache = table; + return 0; + +rollback_table_setup: + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); + release_gid_table(table[port]); + } + + kfree(table); + return err; +} + +static void gid_table_release_one(struct ib_device *ib_dev) +{ + struct ib_gid_table **table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + release_gid_table(table[port]); + + kfree(table); + ib_dev->cache.gid_cache = NULL; +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ + struct ib_gid_table
**table = ib_dev->cache.gid_cache; + u8 port; + + if (!table) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table[port]); +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ + int err; + + err = _gid_table_setup_one(ib_dev); + + if (err) + return err; + + err = roce_rescan_device(ib_dev); + + if (err) { + gid_table_cleanup_one(ib_dev); + gid_table_release_one(ib_dev); + } + + return err; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; +} +EXPORT_SYMBOL(ib_get_cached_gid); + +int ib_find_cached_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, + u8 *port_num, + u16 *index) +{ + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); +} +EXPORT_SYMBOL(ib_find_cached_gid); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index) +{ + /* Only RoCE GID table supports filter function */ + if (!rdma_cap_roce_gid_table(device, port_num) && filter) + return -EPROTONOSUPPORT; + + return ib_cache_gid_find_by_filter(device, gid, + port_num, filter, + context, index); +} +EXPORT_SYMBOL(ib_find_gid_by_filter); + +int ib_get_cached_pkey(struct ib_device *device, + u8 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int ret = 0; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_pkey); + +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + int partial_ix = -1; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (port_num < rdma_start_port(device) || 
port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { + *index = i; + ret = 0; + break; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_exact_cached_pkey); + +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)]; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + +static void ib_cache_update(struct ib_device *device, + u8 port) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_gid_cache { + int table_len; + union ib_gid table[0]; + } *gid_cache = NULL; + int i; + int ret; + struct ib_gid_table *table; + struct ib_gid_table **ports_table = device->cache.gid_cache; + bool use_roce_gid_table = + rdma_cap_roce_gid_table(device, port); + + if (port < rdma_start_port(device) || port > rdma_end_port(device)) + return; + + table = ports_table[port - rdma_start_port(device)]; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return; + + ret = ib_query_port(device, port, tprops); + if (ret) { + pr_warn("ib_query_port failed (%d) for %s\n", + ret, device->name); + goto err; + } + + pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * + sizeof *pkey_cache->table, GFP_KERNEL); + if (!pkey_cache) + goto err; + + pkey_cache->table_len = tprops->pkey_tbl_len; + + if (!use_roce_gid_table) { + gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * + sizeof(*gid_cache->table), GFP_KERNEL); + if (!gid_cache) + goto err; + + gid_cache->table_len = tprops->gid_tbl_len; + } + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, pkey_cache->table + i); + if (ret) { + pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + if (!use_roce_gid_table) { + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, + gid_cache->table + i, NULL); + if (ret) { + pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + } + + write_lock_irq(&device->cache.lock); + + old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; + + device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; + if (!use_roce_gid_table) { + write_lock(&table->rwlock); + for (i = 0; i < gid_cache->table_len; i++) { + modify_gid(device, port, table, i, gid_cache->table + i, + &zattr, false); + } + write_unlock(&table->rwlock); + } + + device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; + + write_unlock_irq(&device->cache.lock); + + kfree(gid_cache); + kfree(old_pkey_cache); + kfree(tprops); + return; + +err: + kfree(pkey_cache); + kfree(gid_cache); + kfree(tprops); +} + +static void ib_cache_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + + ib_cache_update(work->device, work->port_num); + kfree(work); +} + +static void ib_cache_event(struct ib_event_handler *handler, + struct ib_event 
*event) +{ + struct ib_update_work *work; + + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE) { + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (work) { + INIT_WORK(&work->work, ib_cache_task); + work->device = event->device; + work->port_num = event->element.port_num; + queue_work(ib_wq, &work->work); + } + } +} + +int ib_cache_setup_one(struct ib_device *device) +{ + int p; + int err; + + rwlock_init(&device->cache.lock); + + device->cache.pkey_cache = + kzalloc(sizeof *device->cache.pkey_cache * + (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); + device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * + (rdma_end_port(device) - + rdma_start_port(device) + 1), + GFP_KERNEL); + if (!device->cache.pkey_cache || + !device->cache.lmc_cache) { + pr_warn("Couldn't allocate cache for %s\n", device->name); + return -ENOMEM; + } + + err = gid_table_setup_one(device); + if (err) + /* Allocated memory will be cleaned in the release function */ + return err; + + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + ib_cache_update(device, p + rdma_start_port(device)); + + INIT_IB_EVENT_HANDLER(&device->cache.event_handler, + device, ib_cache_event); + err = ib_register_event_handler(&device->cache.event_handler); + if (err) + goto err; + + return 0; + +err: + gid_table_cleanup_one(device); + return err; +} + +void ib_cache_release_one(struct ib_device *device) +{ + int p; + + /* + * The release function frees all the cache elements. + * This function should be called as part of freeing + * all the device's resources when the cache could no + * longer be accessed. + */ + if (device->cache.pkey_cache) + for (p = 0; + p <= rdma_end_port(device) - rdma_start_port(device); ++p) + kfree(device->cache.pkey_cache[p]); + + gid_table_release_one(device); + kfree(device->cache.pkey_cache); + kfree(device->cache.lmc_cache); +} + +void ib_cache_cleanup_one(struct ib_device *device) +{ + /* The cleanup function unregisters the event handler, + * waits for all in-progress workqueue elements and cleans + * up the GID cache. This function should be called after + * the device was removed from the devices list and all + * clients were removed, so the cache exists but is + * non-functional and shouldn't be updated anymore. 
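 *
 * Illustrative only, not part of this change -- the intended teardown
 * order on device removal is therefore roughly:
 *
 *	ib_cache_cleanup_one(device);	stops updates, flushes ib_wq
 *	ib_cache_release_one(device);	frees the pkey/lmc/GID caches
 *
 * with ib_cache_setup_one() having been called at registration time.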
+ */ + ib_unregister_event_handler(&device->cache.event_handler); + flush_workqueue(ib_wq); + gid_table_cleanup_one(device); +} + +void __init ib_cache_setup(void) +{ + roce_gid_mgmt_init(); +} + +void __exit ib_cache_cleanup(void) +{ + roce_gid_mgmt_cleanup(); +} Index: sys/ofed/drivers/infiniband/core/ib_cm.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_cm.c +++ sys/ofed/drivers/infiniband/core/ib_cm.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -60,13 +59,8 @@ MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); -#ifdef pr_fmt -#undef pr_fmt -#endif -#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ - static void cm_add_one(struct ib_device *device); -static void cm_remove_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data); static struct ib_client cm_client = { .name = "cm", @@ -88,6 +82,8 @@ __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; + /* Sync on cm change port state */ + spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -169,6 +165,8 @@ struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; + struct list_head cm_priv_prim_list; + struct list_head cm_priv_altr_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; @@ -177,6 +175,7 @@ struct ib_device *ib_device; struct device *device; u8 ack_delay; + int going_down; struct cm_port *port[0]; }; @@ -186,8 +185,6 @@ struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; - u8 valid; - u8 smac[ETH_ALEN]; }; struct cm_work { @@ -220,13 +217,15 @@ spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; atomic_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. + * Protected by the cm.lock spinlock. 
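 * Each listen on the id (including reuse via ib_cm_insert_listen())
 * increments this count; the destroy path only removes the listen from
 * the service tree once the count drops back to zero.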
*/ + int listen_sharecount; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; - struct ib_cm_compare_data *compare_data; void *private_data; __be64 tid; @@ -248,6 +247,12 @@ u8 service_timeout; u8 target_ack_delay; + struct list_head prim_list; + struct list_head altr_list; + /* Indicates that the send port mad is registered and av is set */ + int prim_send_port_not_ready; + int altr_send_port_not_ready; + struct list_head work_list; atomic_t work_count; }; @@ -266,19 +271,47 @@ struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; + struct cm_av *av; + unsigned long flags, flags2; + int ret = 0; + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (!cm_id_priv->prim_send_port_not_ready) + av = &cm_id_priv->av; + else if (!cm_id_priv->altr_send_port_not_ready && + (cm_id_priv->alt_av.port)) + av = &cm_id_priv->alt_av; + else { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* Make sure the port haven't released the mad yet */ mad_agent = cm_id_priv->av.port->mad_agent; - ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); - if (IS_ERR(ah)) - return PTR_ERR(ah); + if (!mad_agent) { + pr_info("%s: not a valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } + ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto out; + } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - cm_id_priv->av.pkey_index, + av->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); - return PTR_ERR(m); + ret = PTR_ERR(m); + goto out; } /* Timeout set by caller if response is expected. 
*/ @@ -288,7 +321,10 @@ atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; - return 0; + +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; } static int cm_alloc_response_msg(struct cm_port *port, @@ -305,7 +341,8 @@ m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC); + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { ib_destroy_ah(ah); return PTR_ERR(m); @@ -357,41 +394,29 @@ grh, &av->ah_attr); } -int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac) -{ - struct cm_id_private *cm_id_priv; - - cm_id_priv = container_of(id, struct cm_id_private, id); - - if (smac != NULL) - memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac)); - - if (alt_smac != NULL) - memcpy(cm_id_priv->alt_av.smac, alt_smac, - sizeof(cm_id_priv->alt_av.smac)); - - return 0; -} -EXPORT_SYMBOL(ib_update_cm_av); - -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; int ret; u8 p; + struct net_device *ndev = ib_get_ndev_from_path(path); read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } } read_unlock_irqrestore(&cm.device_lock, flags); + if (ndev) + dev_put(ndev); + if (!port) return -EINVAL; @@ -401,32 +426,41 @@ return ret; av->port = port; - ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num, + path, &av->ah_attr); + if (ret) + return ret; + av->timeout = path->packet_life_time + 1; - memcpy(av->smac, path->smac, sizeof(av->smac)); - av->valid = 1; - return 0; + spin_lock_irqsave(&cm.lock, flags); + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; - int ret, id; - static int next_id; + int id; - do { - spin_lock_irqsave(&cm.lock, flags); - ret = idr_get_new_above(&cm.local_id_table, cm_id_priv, - next_id, &id); - if (!ret) - next_id = ((unsigned) id + 1) & MAX_IDR_MASK; - spin_unlock_irqrestore(&cm.lock, flags); - } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) ); + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&cm.lock, flags); + + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&cm.lock, flags); + idr_preload_end(); cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return ret; + return id < 0 ? 
id : 0; } static void cm_free_id(__be32 local_id) @@ -464,41 +498,6 @@ return cm_id_priv; } -static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask) -{ - int i; - - for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++) - ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] & - ((unsigned long *) mask)[i]; -} - -static int cm_compare_data(struct ib_cm_compare_data *src_data, - struct ib_cm_compare_data *dst_data) -{ - u8 src[IB_CM_COMPARE_SIZE]; - u8 dst[IB_CM_COMPARE_SIZE]; - - if (!src_data || !dst_data) - return 0; - - cm_mask_copy(src, src_data->data, dst_data->mask); - cm_mask_copy(dst, dst_data->data, src_data->mask); - return memcmp(src, dst, IB_CM_COMPARE_SIZE); -} - -static int cm_compare_private_data(u8 *private_data, - struct ib_cm_compare_data *dst_data) -{ - u8 src[IB_CM_COMPARE_SIZE]; - - if (!dst_data) - return 0; - - cm_mask_copy(src, private_data, dst_data->mask); - return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE); -} - /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable @@ -531,18 +530,14 @@ struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; __be64 service_mask = cm_id_priv->id.service_mask; - int data_cmp; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); - data_cmp = cm_compare_data(cm_id_priv->compare_data, - cur_cm_id_priv->compare_data); if ((cur_cm_id_priv->id.service_mask & service_id) == (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device) && - !data_cmp) + (cm_id_priv->id.device == cur_cm_id_priv->id.device)) return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) @@ -553,8 +548,6 @@ link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; - else if (data_cmp < 0) - link = &(*link)->rb_left; else link = &(*link)->rb_right; } @@ -564,20 +557,16 @@ } static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id, - u8 *private_data) + __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; - int data_cmp; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); - data_cmp = cm_compare_private_data(private_data, - cm_id_priv->compare_data); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && - (cm_id_priv->id.device == device) && !data_cmp) + (cm_id_priv->id.device == device)) return cm_id_priv; if (device < cm_id_priv->id.device) @@ -588,8 +577,6 @@ node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; - else if (data_cmp < 0) - node = node->rb_left; else node = node->rb_right; } @@ -746,6 +733,8 @@ spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->prim_list); + INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; @@ -813,11 +802,11 @@ } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags) +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; - timewait_info = kzalloc(sizeof *timewait_info, flags); + timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); if (!timewait_info) return ERR_PTR(-ENOMEM); @@ 
-831,6 +820,11 @@ { int wait_time; unsigned long flags; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); @@ -844,8 +838,14 @@ */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); - queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, - msecs_to_jiffies(wait_time)); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->timewait_info = NULL; } @@ -873,9 +873,15 @@ spin_lock_irq(&cm_id_priv->lock); switch (cm_id->state) { case IB_CM_LISTEN: - cm_id->state = IB_CM_IDLE; spin_unlock_irq(&cm_id_priv->lock); + spin_lock_irq(&cm.lock); + if (--cm_id_priv->listen_sharecount > 0) { + /* The id is still shared. */ + cm_deref_id(cm_id_priv); + spin_unlock_irq(&cm.lock); + return; + } rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); spin_unlock_irq(&cm.lock); break; @@ -887,8 +893,14 @@ case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, @@ -907,7 +919,6 @@ NULL, 0, NULL, 0); } break; - case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); @@ -939,12 +950,20 @@ break; } + spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->altr_list) && + (!cm_id_priv->altr_send_port_not_ready)) + list_del(&cm_id_priv->altr_list); + if (!list_empty(&cm_id_priv->prim_list) && + (!cm_id_priv->prim_send_port_not_ready)) + list_del(&cm_id_priv->prim_list); + spin_unlock_irq(&cm.lock); + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - kfree(cm_id_priv->compare_data); kfree(cm_id_priv->private_data); kfree(cm_id_priv); } @@ -955,11 +974,23 @@ } EXPORT_SYMBOL(ib_destroy_cm_id); -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, - struct ib_cm_compare_data *compare_data) +/** + * __ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. 
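 *
 * Illustrative only, not part of this change -- the service ID below is
 * hypothetical.  __ib_cm_listen() expects cm.lock to be held; external
 * callers use the ib_cm_listen() wrapper, which takes the lock itself:
 *
 *	ret = ib_cm_listen(cm_id, cpu_to_be64(0x1234ULL), 0);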
+ */ +static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask) { struct cm_id_private *cm_id_priv, *cur_cm_id_priv; - unsigned long flags; int ret = 0; service_mask = service_mask ? service_mask : ~cpu_to_be64(0); @@ -972,20 +1003,9 @@ if (cm_id->state != IB_CM_IDLE) return -EINVAL; - if (compare_data) { - cm_id_priv->compare_data = kzalloc(sizeof *compare_data, - GFP_KERNEL); - if (!cm_id_priv->compare_data) - return -ENOMEM; - cm_mask_copy(cm_id_priv->compare_data->data, - compare_data->data, compare_data->mask); - memcpy(cm_id_priv->compare_data->mask, compare_data->mask, - IB_CM_COMPARE_SIZE); - } - cm_id->state = IB_CM_LISTEN; + ++cm_id_priv->listen_sharecount; - spin_lock_irqsave(&cm.lock, flags); if (service_id == IB_CM_ASSIGN_SERVICE_ID) { cm_id->service_id = cpu_to_be64(cm.listen_service_id++); cm_id->service_mask = ~cpu_to_be64(0); @@ -994,18 +1014,95 @@ cm_id->service_mask = service_mask; } cur_cm_id_priv = cm_insert_listen(cm_id_priv); - spin_unlock_irqrestore(&cm.lock, flags); if (cur_cm_id_priv) { cm_id->state = IB_CM_IDLE; - kfree(cm_id_priv->compare_data); - cm_id_priv->compare_data = NULL; + --cm_id_priv->listen_sharecount; ret = -EBUSY; } return ret; } + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm.lock, flags); + ret = __ib_cm_listen(cm_id, service_id, service_mask); + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; +} EXPORT_SYMBOL(ib_cm_listen); +/** + * Create a new listening ib_cm_id and listen on the given service ID. + * + * If there's an existing ID listening on that same device and service ID, + * return it. + * + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * + * Callers should call ib_destroy_cm_id when done with the listener ID. 
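 *
 * Illustrative only, not part of this change -- the handler and service
 * ID below are hypothetical:
 *
 *	listen_id = ib_cm_insert_listen(device, my_cm_handler,
 *					cpu_to_be64(0x107ULL));
 *	if (IS_ERR(listen_id))
 *		return PTR_ERR(listen_id);
 *	...
 *	ib_destroy_cm_id(listen_id);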
+ */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_id *cm_id; + unsigned long flags; + int err = 0; + + /* Create an ID in advance, since the creation may sleep */ + cm_id = ib_create_cm_id(device, cm_handler, NULL); + if (IS_ERR(cm_id)) + return cm_id; + + spin_lock_irqsave(&cm.lock, flags); + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + goto new_id; + + /* Find an existing ID */ + cm_id_priv = cm_find_listen(device, service_id); + if (cm_id_priv) { + if (cm_id->cm_handler != cm_handler || cm_id->context) { + /* Sharing an ib_cm_id with different handlers is not + * supported */ + spin_unlock_irqrestore(&cm.lock, flags); + return ERR_PTR(-EINVAL); + } + atomic_inc(&cm_id_priv->refcount); + ++cm_id_priv->listen_sharecount; + spin_unlock_irqrestore(&cm.lock, flags); + + ib_destroy_cm_id(cm_id); + cm_id = &cm_id_priv->id; + return cm_id; + } + +new_id: + /* Use newly created ID */ + err = __ib_cm_listen(cm_id, service_id, 0); + + spin_unlock_irqrestore(&cm.lock, flags); + + if (err) { + ib_destroy_cm_id(cm_id); + return ERR_PTR(err); + } + return cm_id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); + static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, enum cm_msg_sequence msg_seq) { @@ -1058,7 +1155,7 @@ cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_retry_count(req_msg, param->retry_count); cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); - cm_req_set_srq(req_msg, param->srq); + cm_req_set_srq(req_msg, param->srq); } if (pri_path->hop_limit <= 1) { @@ -1150,28 +1247,28 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return -EINVAL; + ret = -EINVAL; + goto out; } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id, - GFP_ATOMIC); + id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return (PTR_ERR(cm_id_priv->timewait_info)); + ret = PTR_ERR(cm_id_priv->timewait_info); + goto out; } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); - if (!ret && param->alternate_path) { + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + cm_id_priv); + if (ret) + goto error1; + if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, - &cm_id_priv->alt_av); - } - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); + &cm_id_priv->alt_av, cm_id_priv); + if (ret) goto error1; } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( @@ -1210,11 +1307,9 @@ spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; -error2: - cm_free_msg(cm_id_priv->msg); -error1: - kfree(cm_id_priv->timewait_info); - return ret; +error2: cm_free_msg(cm_id_priv->msg); +error1: kfree(cm_id_priv->timewait_info); +out: return ret; } EXPORT_SYMBOL(ib_send_cm_req); @@ -1254,14 +1349,6 @@ return ret; } -static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, - __be32 local_qpn, __be32 remote_qpn) -{ - return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) || - ((local_ca_guid == remote_ca_guid) && - (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); -} - static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct ib_sa_path_rec 
*primary_path, struct ib_sa_path_rec *alt_path) @@ -1285,6 +1372,7 @@ primary_path->packet_life_time = cm_req_get_primary_local_ack_timeout(req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + primary_path->service_id = req_msg->service_id; if (req_msg->alt_local_lid) { memset(alt_path, 0, sizeof *alt_path); @@ -1306,7 +1394,26 @@ alt_path->packet_life_time = cm_req_get_alt_local_ack_timeout(req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + alt_path->service_id = req_msg->service_id; + } +} + +static u16 cm_get_bth_pkey(struct cm_work *work) +{ + struct ib_device *ib_dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + u16 pkey_index = work->mad_recv_wc->wc->pkey_index; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); + if (ret) { + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", + port_num, pkey_index, ret); + return 0; } + + return pkey; } static void cm_format_req_event(struct cm_work *work, @@ -1319,6 +1426,7 @@ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.req_rcvd; param->listen_id = listen_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; if (req_msg->alt_local_lid) @@ -1501,8 +1609,7 @@ /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, - req_msg->service_id, - req_msg->private_data); + req_msg->service_id); if (!listen_cm_id_priv) { cm_cleanup_timewait(cm_id_priv->timewait_info); spin_unlock_irq(&cm.lock); @@ -1553,6 +1660,8 @@ struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1567,8 +1676,7 @@ work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id, - GFP_KERNEL); + id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto destroy; @@ -1592,20 +1700,41 @@ cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); - /* Workarround: path in req_msg doesn't contain MAC, take it from wc */ - memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6); - work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->if_index; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, &work->path[0].sgid); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->if_index; + work->path[0].net = 
dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } if (req_msg->alt_local_lid) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, + cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, @@ -1687,7 +1816,6 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } @@ -1754,7 +1882,6 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto error; } @@ -1859,7 +1986,6 @@ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("no cm_id_priv\n"); return -EINVAL; } @@ -1873,7 +1999,6 @@ default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto error; } @@ -1887,7 +2012,6 @@ spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("Failed to insert remote id\n"); goto error; } /* Check for a stale connection. */ @@ -1901,7 +2025,6 @@ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug("Stale connection.\n"); goto error; } spin_unlock(&cm.lock); @@ -2042,7 +2165,6 @@ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } @@ -2112,7 +2234,6 @@ if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); - pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state); return -EINVAL; } @@ -2178,7 +2299,6 @@ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug("no cm_id_priv\n"); return -EINVAL; } @@ -2219,7 +2339,6 @@ counter[CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2323,7 +2442,6 @@ cm_enter_timewait(cm_id_priv); break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; goto out; } @@ -2428,13 +2546,12 @@ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_enter_timewait(cm_id_priv); - break; + cm_enter_timewait(cm_id_priv); + break; } /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2497,7 +2614,6 @@ break; } default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2599,7 +2715,6 @@ counter[CM_MRA_COUNTER]); /* fall through */ default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); goto out; } @@ -2674,7 +2789,8 @@ goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + cm_id_priv); if (ret) goto out; cm_id_priv->alt_av.timeout = @@ -2786,8 +2902,8 @@ cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av)) - goto unlock; + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); @@ -2979,10 +3095,7 @@ return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - - spin_lock_irqsave(&cm_id_priv->lock, flags); - - ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; @@ -2999,19 +3112,21 @@ msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; + spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out: return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3027,6 +3142,8 @@ param = &work->cm_event.param.sidr_req_rcvd; param->pkey = __be16_to_cpu(sidr_req_msg->pkey); param->listen_id = listen_id; + param->service_id = sidr_req_msg->service_id; + param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; work->cm_event.private_data = &sidr_req_msg->private_data; } @@ -3066,8 +3183,7 @@ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; cur_cm_id_priv = cm_find_listen(cm_id->device, - sidr_req_msg->service_id, - sidr_req_msg->private_data); + sidr_req_msg->service_id); if (!cur_cm_id_priv) { spin_unlock_irq(&cm.lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); @@ -3147,7 +3263,10 @@ spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + 
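		/*
		 * Clear the node so that the RB_EMPTY_NODE() check in the
		 * destroy path does not try to erase this entry a second
		 * time.
		 */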
RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } spin_unlock_irqrestore(&cm.lock, flags); return 0; @@ -3339,7 +3458,6 @@ ret = cm_timewait_handler(work); break; default: - pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3353,6 +3471,11 @@ struct cm_work *work; unsigned long flags; int ret = 0; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) @@ -3370,7 +3493,6 @@ ret = -EISCONN; break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; break; } @@ -3392,7 +3514,17 @@ work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irqrestore(&cm.lock, flags); + out: return ret; } @@ -3400,7 +3532,9 @@ static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; + struct cm_av tmp_av; unsigned long flags; + int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3409,7 +3543,14 @@ (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; + /* Swap address vector */ + tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; + cm_id_priv->alt_av = tmp_av; + /* Swap port send ready state */ + tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; + cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; + cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -3436,6 +3577,7 @@ EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; @@ -3443,6 +3585,7 @@ enum ib_cm_event_type event; u16 attr_id; int paths = 0; + int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: @@ -3501,7 +3644,19 @@ work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, @@ -3533,7 +3688,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3560,31 +3714,6 @@ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; - if (!cm_id_priv->av.valid) - return -EINVAL; - if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { - qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_VID; - } - if (!is_zero_ether_addr(cm_id_priv->av.smac)) { - memcpy(qp_attr->smac, cm_id_priv->av.smac, - sizeof(qp_attr->smac)); - *qp_attr_mask |= IB_QP_SMAC; - } - if (cm_id_priv->alt_av.valid) { - if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { - 
qp_attr->alt_vlan_id = - cm_id_priv->alt_av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_ALT_VID; - } - if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { - memcpy(qp_attr->alt_smac, - cm_id_priv->alt_av.smac, - sizeof(qp_attr->alt_smac)); - *qp_attr_mask |= IB_QP_ALT_SMAC; - } - } - qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -3606,7 +3735,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3666,7 +3794,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3693,7 +3820,6 @@ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: - pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state); ret = -EINVAL; break; } @@ -3701,16 +3827,6 @@ } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { @@ -3806,26 +3922,24 @@ struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, - .mgmt_class_version = IB_CM_CLASS_VERSION + .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int ret; + int count = 0; u8 i; - if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB) - return; - cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * ib_device->phys_port_cnt, GFP_KERNEL); if (!cm_dev) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); - + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; + cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, "%s", ib_device->name); @@ -3836,6 +3950,9 @@ set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = kzalloc(sizeof *port, GFP_KERNEL); if (!port) goto error1; @@ -3844,6 +3961,9 @@ port->cm_dev = cm_dev; port->port_num = i; + INIT_LIST_HEAD(&port->cm_priv_prim_list); + INIT_LIST_HEAD(&port->cm_priv_altr_list); + ret = cm_create_port_fs(port); if (ret) goto error1; @@ -3854,14 +3974,21 @@ 0, cm_send_handler, cm_recv_handler, - port); + port, + 0); if (IS_ERR(port->mad_agent)) goto error2; ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) goto error3; + + count++; } + + if (!count) + goto free; + ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); @@ -3877,26 +4004,31 @@ port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; while (--i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); ib_unregister_mad_agent(port->mad_agent); cm_remove_port_fs(port); } +free: device_unregister(cm_dev->device); kfree(cm_dev); } -static void cm_remove_one(struct ib_device *ib_device) +static void cm_remove_one(struct ib_device *ib_device, void *client_data) { - struct cm_device *cm_dev; + struct cm_device *cm_dev = client_data; struct cm_port *port; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent 
*cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; int i; - cm_dev = ib_get_client_data(ib_device, &cm_client); if (!cm_dev) return; @@ -3904,13 +4036,37 @@ list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - ib_unregister_mad_agent(port->mad_agent); + /* Mark all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) + cm_id_priv->altr_send_port_not_ready = 1; + list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) + cm_id_priv->prim_send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ flush_workqueue(cm.wq); + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } + device_unregister(cm_dev->device); kfree(cm_dev); } @@ -3923,6 +4079,7 @@ INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; @@ -3930,8 +4087,6 @@ cm.remote_sidr_table = RB_ROOT; idr_init(&cm.local_id_table); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); - if (!idr_pre_get(&cm.local_id_table, GFP_KERNEL)) - return -ENOMEM; INIT_LIST_HEAD(&cm.timewait_list); ret = class_register(&cm_class); Index: sys/ofed/drivers/infiniband/core/ib_cma.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_cma.c +++ sys/ofed/drivers/infiniband/core/ib_cma.c @@ -3,7 +3,6 @@ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -45,71 +44,67 @@ #include #include #include -#include #include #include #include +#include +#include + #include #include +#include +#include #include #include #include #include +#include + +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 18 -static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; -module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); -MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)"); - -static int def_prec2sl = 3; -module_param_named(def_prec2sl, def_prec2sl, int, 0644); -MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. 
Valid values 0 - 7"); - -static int unify_tcp_port_space = 1; -module_param(unify_tcp_port_space, int, 0644); -MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " - "space allocation (default=1)"); - -static int debug_level = 0; -#define cma_pr(level, priv, format, arg...) \ - printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) - -#define cma_dbg(priv, format, arg...) \ - do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) - -#define cma_warn(priv, format, arg...) \ - cma_pr(KERN_WARNING, priv, format, ## arg) - -#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" -#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ - ((u8 *)(gid))[13],\ - ((u8 *)(gid))[14],\ - ((u8 *)(gid))[15] - -#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) -#define cma_debug_path(priv, pfx, p) \ - cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \ - CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \ - CMA_GID_ARG(p.dgid)) +static const char * const cma_events[] = { + [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", + [RDMA_CM_EVENT_ADDR_ERROR] = "address error", + [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", + [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", + [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", + [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", + [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", + [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", + [RDMA_CM_EVENT_REJECTED] = "rejected", + [RDMA_CM_EVENT_ESTABLISHED] = "established", + [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", + [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", + [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", + [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", + [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", + [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", +}; -#define cma_debug_gid(priv, g) \ - cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) +{ + size_t index = event; -module_param_named(debug_level, debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "debug level default=0"); + return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? 
+ cma_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(rdma_event_msg); static void cma_add_one(struct ib_device *device); -static void cma_remove_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", @@ -123,12 +118,44 @@ static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static struct workqueue_struct *cma_free_wq; -static DEFINE_IDR(sdp_ps); -static DEFINE_IDR(tcp_ps); -static DEFINE_IDR(udp_ps); -static DEFINE_IDR(ipoib_ps); -static DEFINE_IDR(ib_ps); + +struct cma_pernet { + struct idr tcp_ps; + struct idr udp_ps; + struct idr ipoib_ps; + struct idr ib_ps; +}; + +VNET_DEFINE(struct cma_pernet, cma_pernet); + +static struct cma_pernet *cma_pernet_ptr(struct vnet *vnet) +{ + struct cma_pernet *retval; + + CURVNET_SET_QUIET(vnet); + retval = &VNET(cma_pernet); + CURVNET_RESTORE(); + + return (retval); +} + +static struct idr *cma_pernet_idr(struct vnet *net, enum rdma_port_space ps) +{ + struct cma_pernet *pernet = cma_pernet_ptr(net); + + switch (ps) { + case RDMA_PS_TCP: + return &pernet->tcp_ps; + case RDMA_PS_UDP: + return &pernet->udp_ps; + case RDMA_PS_IPOIB: + return &pernet->ipoib_ps; + case RDMA_PS_IB: + return &pernet->ib_ps; + default: + return NULL; + } +} struct cma_device { struct list_head list; @@ -136,18 +163,112 @@ struct completion comp; atomic_t refcount; struct list_head id_list; + struct sysctl_ctx_list sysctl_ctx; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { - struct idr *ps; + enum rdma_port_space ps; struct hlist_head owners; unsigned short port; }; +struct class_port_info_context { + struct ib_class_port_info *class_port_info; + struct ib_device *device; + struct completion done; + struct ib_sa_query *sa_query; + u8 port_num; +}; + +static int cma_ps_alloc(struct vnet *vnet, enum rdma_port_space ps, + struct rdma_bind_list *bind_list, int snum) +{ + struct idr *idr = cma_pernet_idr(vnet, ps); + + return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(struct vnet *net, + enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_find(idr, snum); +} + +static void cma_ps_remove(struct vnet *net, enum rdma_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + idr_remove(idr, snum); +} + enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = 
roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -158,7 +279,6 @@ struct rdma_cm_id id; struct rdma_bind_list *bind_list; - struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ @@ -168,13 +288,11 @@ int internal_id; enum rdma_cm_state state; spinlock_t lock; - spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; - struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; @@ -194,10 +312,7 @@ u8 tos; u8 reuseaddr; u8 afonly; - int qp_timeout; - /* cache for mc record params */ - struct ib_sa_mcmember_rec rec; - int is_valid_rec; + enum ib_gid_type gid_type; }; struct cma_multicast { @@ -209,6 +324,8 @@ void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; + u8 join_state; }; struct cma_work { @@ -247,25 +364,17 @@ union cma_ip_addr dst_addr; }; -struct sdp_hh { - u8 bsdh[16]; - u8 sdp_version; /* Major version: 7:4 */ - u8 ip_version; /* IP version: 7:4 */ - u8 sdp_specific1[10]; - __be16 port; - __be16 sdp_specific2; - union cma_ip_addr src_addr; - union cma_ip_addr dst_addr; -}; +#define CMA_VERSION 0x00 -struct sdp_hah { - u8 bsdh[16]; - u8 sdp_version; +struct cma_req_info { + struct ib_device *device; + int port; + union ib_gid local_gid; + __be64 service_id; + u16 pkey; + bool has_gid:1; }; -#define CMA_VERSION 0x00 -#define SDP_MAJ_VERSION 0x2 - static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) { unsigned long flags; @@ -303,7 +412,7 @@ return old; } -static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } @@ -313,33 +422,28 @@ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 sdp_get_majv(u8 sdp_version) -{ - return sdp_version >> 4; -} - -static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) -{ - return hh->ip_version >> 4; -} - -static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) -{ - hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); -} - -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); @@ -362,16 +466,40 @@ mutex_unlock(&lock); } 
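/*
 * Illustrative only, not part of this change -- cma_enum_devices_by_ibdev()
 * takes a reference on the matching cma_device, so the caller drops it with
 * cma_deref_dev() when done.  Assuming cma_device_filter is a predicate over
 * (struct ib_device *, void *cookie), a hypothetical consumer looks like:
 *
 *	static bool match_by_name(struct ib_device *dev, void *cookie)
 *	{
 *		return strcmp(dev->name, (const char *)cookie) == 0;
 *	}
 *
 *	cma_dev = cma_enum_devices_by_ibdev(match_by_name, "mlx4_0");
 *	if (cma_dev) {
 *		... use cma_get_default_gid_type(cma_dev, port) ...
 *		cma_deref_dev(cma_dev);
 *	}
 */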
-static int cma_set_qkey(struct rdma_id_private *id_priv) +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; return 0; + } + + if (qkey) { + id_priv->qkey = qkey; + return 0; + } switch (id_priv->id.ps) { case RDMA_PS_UDP: + case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: @@ -388,109 +516,78 @@ return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} - err = ib_query_port(device, port_num, &props); - if (err) - return 1; +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return 1; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr, NULL); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; } - return -EAGAIN; + return ret; } -int -rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, - void **cm_id) +static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, + union ib_gid *gid, int dev_type, + struct vnet *net, + int bound_if_index) { - int ret; - u8 port; - int found_dev = 0, found_cmid = 0; - struct rdma_id_private *id_priv; - struct rdma_id_private *dev_id_priv; - struct cma_device *cma_dev; - struct rdma_dev_addr dev_addr; - union ib_gid gid; - enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? 
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + int ret = -ENODEV; + struct net_device *ndev = NULL; - memset(&dev_addr, 0, sizeof(dev_addr)); + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) + return ret; - ret = rdma_translate_ip((struct sockaddr *)local_addr, - &dev_addr, NULL); - if (ret) - goto err; + if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) + return ret; - /* find rdma device based on MAC address/gid */ - mutex_lock(&lock); + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(net, bound_if_index); + if (ndev && ndev->if_flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); - memcpy(&gid, dev_addr.src_dev_addr + - rdma_addr_gid_offset(&dev_addr), sizeof(gid)); + if (!device->get_netdev) + return -EOPNOTSUPP; - list_for_each_entry(cma_dev, &dev_list, list) - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) - if ((rdma_port_get_link_layer(cma_dev->device, port) == - dev_ll) && - (rdma_node_get_transport(cma_dev->device->node_type) == - RDMA_TRANSPORT_IWARP)) { - ret = find_gid_port(cma_dev->device, - &gid, port); - if (!ret) { - found_dev = 1; - goto out; - } else if (ret == 1) { - mutex_unlock(&lock); - goto err; - } - } -out: - mutex_unlock(&lock); + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } else { + gid_type = IB_GID_TYPE_IB; + } - if (!found_dev) - goto err; + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); - /* Traverse through the list of listening cm_id's to find the - * desired cm_id based on rdma device & port number. - */ - list_for_each_entry(id_priv, &listen_any_list, list) - list_for_each_entry(dev_id_priv, &id_priv->listen_list, - listen_list) - if (dev_id_priv->cma_dev == cma_dev) - if (dev_id_priv->cm_id.iw->local_addr.sin_port - == local_addr->sin_port) { - *cm_id = (void *)dev_id_priv->cm_id.iw; - found_cmid = 1; - } - return found_cmid ? 0 : -ENODEV; + if (ndev) + dev_put(ndev); -err: - return -ENODEV; + return ret; } -EXPORT_SYMBOL(rdma_find_cmid_laddr); static int cma_acquire_dev(struct rdma_id_private *id_priv, struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid; + union ib_gid gid, iboe_gid, *gidp; int ret = -ENODEV; - u8 port, found_port; - enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? - IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + u8 port; - if (dev_ll != IB_LINK_LAYER_INFINIBAND && + if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; @@ -500,42 +597,46 @@ memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); - if (listen_id_priv && - rdma_port_get_link_layer(listen_id_priv->id.device, - listen_id_priv->id.port_num) == dev_ll) { + + if (listen_id_priv) { cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, - &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, - &found_port, NULL); - - if (!ret && (port == found_port)) { - id_priv->id.port_num = found_port; + gidp = rdma_protocol_roce(cma_dev->device, port) ? 
+ &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, + dev_addr->dev_type, + dev_addr->net, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; goto out; } } + list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { if (listen_id_priv && listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - - if (!ret && (port == found_port)) { - id_priv->id.port_num = port; - goto out; - } else if (ret == 1) - break; + + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, + dev_addr->net, + dev_addr->bound_dev_if); + if (!ret) { + id_priv->id.port_num = port; + goto out; } } } @@ -548,24 +649,70 @@ return ret; } -static void cma_deref_id(struct rdma_id_private *id_priv) +/* + * Select the source IB device and address to reach the destination IB address. + */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { - if (atomic_dec_and_test(&id_priv->refcount)) - complete(&id_priv->comp); -} + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + int i; -static int cma_disable_callback(struct rdma_id_private *id_priv, - enum rdma_cm_state state) -{ - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != state) { - mutex_unlock(&id_priv->handler_mutex); - return -EINVAL; + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!rdma_cap_af_ib(cur_dev->device, p)) + continue; + + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, + &gid, NULL); + i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix)) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } -struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +struct rdma_cm_id *rdma_create_id(struct vnet *net, + rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type) { @@ -575,14 +722,13 @@ if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->owner = 
curthread->td_proc->p_pid; + id_priv->owner = task_pid_nr(current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); - spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); @@ -590,6 +736,7 @@ INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + id_priv->id.route.addr.dev_addr.net = TD_TO_VNET(curthread); return &id_priv->id; } @@ -645,6 +792,7 @@ if (id->device != pd->device) return -EINVAL; + qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); @@ -705,23 +853,13 @@ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - qp_attr.ah_attr.grh.sgid_index, &sgid); + qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); if (ret) goto out; - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) - == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) - == IB_LINK_LAYER_ETHERNET) { - u32 scope_id = rdma_get_ipv6_scope_id(id_priv->id.device, - id_priv->id.port_num); - - ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL, - scope_id); - if (ret) - goto out; - } + BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; @@ -750,12 +888,6 @@ if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; - - if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) { - qp_attr.timeout = id_priv->qp_timeout; - qp_attr_mask |= IB_QP_TIMEOUT; - } - ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); @@ -787,11 +919,10 @@ int ret; u16 pkey; - if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == - IB_LINK_LAYER_INFINIBAND) - pkey = ib_addr_get_pkey(dev_addr); - else + if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; + else + pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); @@ -802,7 +933,7 @@ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, 0); if (ret) return ret; @@ -822,29 +953,24 @@ int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); - break; - default: + } else ret = -ENOSYS; - break; - } return ret; } @@ -852,38 +978,36 @@ static inline int cma_zero_addr(struct sockaddr *addr) { - struct in6_addr *ip6; - - 
if (addr->sa_family == AF_INET) - return ipv4_is_zeronet( - ((struct sockaddr_in *)addr)->sin_addr.s_addr); - else { - ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; - return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | - ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - else - return ipv6_addr_loopback( - &((struct sockaddr_in6 *) addr)->sin6_addr); + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } -int -rdma_cma_any_addr(struct sockaddr *addr) -{ - return cma_any_addr(addr); -} -EXPORT_SYMBOL(rdma_cma_any_addr); static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { @@ -894,18 +1018,31 @@ case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; - default: + case AF_INET6: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); } } -static inline __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; - else + case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } } static inline int cma_any_port(struct sockaddr *addr) @@ -913,100 +1050,467 @@ return !cma_port(addr); } -static int cma_get_net_info(void *hdr, enum rdma_port_space ps, - u8 *ip_ver, __be16 *port, - union cma_ip_addr **src, union cma_ip_addr **dst) +static void cma_save_ib_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_sa_path_rec *path) { - switch (ps) { - case RDMA_PS_SDP: - if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; + struct sockaddr_ib *listen_ib, *ib; + + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + if (src_addr) { + ib = (struct sockaddr_ib *)src_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = path->service_id; + ib->sib_scope_id = 0; + } else { + ib->sib_pkey = listen_ib->sib_pkey; + ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + ib->sib_sid = listen_ib->sib_sid; + ib->sib_scope_id = listen_ib->sib_scope_id; + } + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + } + if (dst_addr) { 
+ ib = (struct sockaddr_ib *)dst_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } + } +} - *ip_ver = sdp_get_ip_ver(hdr); - *port = ((struct sdp_hh *) hdr)->port; - *src = &((struct sdp_hh *) hdr)->src_addr; - *dst = &((struct sdp_hh *) hdr)->dst_addr; - break; - default: - if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) - return -EINVAL; +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; + } - *ip_ver = cma_get_ip_ver(hdr); - *port = ((struct cma_hdr *) hdr)->port; - *src = &((struct cma_hdr *) hdr)->src_addr; - *dst = &((struct cma_hdr *) hdr)->dst_addr; - break; + if (dst_addr) { + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; } +} - if (*ip_ver != 4 && *ip_ver != 6) - return -EINVAL; - return 0; +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; + } +} + +static u16 cma_port_from_service_id(__be64 service_id) +{ + return (u16)be64_to_cpu(service_id); } -static void cma_save_net_info(struct rdma_addr *addr, - struct rdma_addr *listen_addr, - u8 ip_ver, __be16 port, - union cma_ip_addr *src, union cma_ip_addr *dst) +static int cma_save_ip_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct ib_cm_event *ib_event, + __be64 service_id) { - struct sockaddr_in *listen4, *ip4; - struct sockaddr_in6 *listen6, *ip6; + struct cma_hdr *hdr; + __be16 port; + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; - switch (ip_ver) { + port = htons(cma_port_from_service_id(service_id)); + + switch (cma_get_ip_ver(hdr)) { case 4: - listen4 = (struct sockaddr_in *) &listen_addr->src_addr; - ip4 = (struct sockaddr_in *) &addr->src_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = dst->ip4.addr; - ip4->sin_port = listen4->sin_port; - ip4->sin_len = sizeof(struct sockaddr_in); - - ip4 = (struct sockaddr_in *) &addr->dst_addr; - ip4->sin_family = listen4->sin_family; - ip4->sin_addr.s_addr = src->ip4.addr; - ip4->sin_port = port; - ip4->sin_len = sizeof(struct sockaddr_in); + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: - listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; - ip6 = (struct sockaddr_in6 *) &addr->src_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = dst->ip6; - ip6->sin6_port = listen6->sin6_port; - ip6->sin6_len = sizeof(struct sockaddr_in6); - ip6->sin6_scope_id = listen6->sin6_scope_id; - - ip6 = (struct sockaddr_in6 *) &addr->dst_addr; - ip6->sin6_family = listen6->sin6_family; - ip6->sin6_addr = src->ip6; - ip6->sin6_port = port; - ip6->sin6_len = sizeof(struct sockaddr_in6); - ip6->sin6_scope_id = listen6->sin6_scope_id; + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct 
sockaddr_in6 *)dst_addr, hdr, port); break; default: - break; + return -EAFNOSUPPORT; } + + return 0; } -static inline int cma_user_data_offset(enum rdma_port_space ps) +static int cma_save_net_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + sa_family_t sa_family, __be64 service_id) { - switch (ps) { - case RDMA_PS_SDP: + if (sa_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, + ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; + } + + return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + const struct ib_cm_req_event_param *req_param = + &ib_event->param.req_rcvd; + const struct ib_cm_sidr_req_event_param *sidr_param = + &ib_event->param.sidr_req_rcvd; + + switch (ib_event->event) { + case IB_CM_REQ_RECEIVED: + req->device = req_param->listen_id->device; + req->port = req_param->port; + memcpy(&req->local_gid, &req_param->primary_path->sgid, + sizeof(req->local_gid)); + req->has_gid = true; + req->service_id = req_param->primary_path->service_id; + req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); + break; + case IB_CM_SIDR_REQ_RECEIVED: + req->device = sidr_param->listen_id->device; + req->port = sidr_param->port; + req->has_gid = false; + req->service_id = sidr_param->service_id; + req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); + break; + default: + return -EINVAL; + } + + return 0; +} + +static bool validate_ipv4_net_dev(struct net_device *net_dev, + const struct sockaddr_in *dst_addr, + const struct sockaddr_in *src_addr) +{ +#ifdef INET + struct sockaddr_in dst_tmp = *dst_addr; + __be32 daddr = dst_addr->sin_addr.s_addr, + saddr = src_addr->sin_addr.s_addr; + struct net_device *src_dev; + struct rtentry *rte; + bool ret; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || + ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || + ipv4_is_loopback(saddr)) + return false; + + src_dev = ip_dev_find(net_dev->if_vnet, saddr); + if (src_dev != net_dev) + return false; + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. 
+ */ + dst_tmp.sin_len = sizeof(dst_tmp); + + CURVNET_SET(net_dev->if_vnet); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + if (rte != NULL) { + ret = (rte->rt_ifp == net_dev); + RTFREE_LOCKED(rte); + } else { + ret = false; + } + return ret; +#else + return false; +#endif +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, + const struct sockaddr_in6 *dst_addr, + const struct sockaddr_in6 *src_addr) +{ +#ifdef INET6 + struct sockaddr_in6 dst_tmp = *dst_addr; + struct in6_addr in6_addr = src_addr->sin6_addr; + struct net_device *src_dev; + struct rtentry *rte; + bool ret; + + /* embed scope ID */ + in6_addr.s6_addr[3] = src_addr->sin6_scope_id; + + src_dev = ip6_dev_find(net_dev->if_vnet, in6_addr); + if (src_dev != net_dev) + return false; + + /* + * Make sure the socket address length field + * is set, else rtalloc1() will fail. + */ + dst_tmp.sin6_len = sizeof(dst_tmp); + + CURVNET_SET(net_dev->if_vnet); + rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0); + CURVNET_RESTORE(); + if (rte != NULL) { + ret = (rte->rt_ifp == net_dev); + RTFREE_LOCKED(rte); + } else { + ret = false; + } + return ret; +#else + return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, + const struct sockaddr *daddr, + const struct sockaddr *saddr) +{ + const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + + switch (daddr->sa_family) { + case AF_INET: + return saddr->sa_family == AF_INET && + validate_ipv4_net_dev(net_dev, daddr4, saddr4); + + case AF_INET6: + return saddr->sa_family == AF_INET6 && + validate_ipv6_net_dev(net_dev, daddr6, saddr6); + + default: + return false; + } +} + +static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, + const struct cma_req_info *req) +{ + struct sockaddr_storage listen_addr_storage, src_addr_storage; + struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, + *src_addr = (struct sockaddr *)&src_addr_storage; + struct net_device *net_dev; + const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; + int err; + + err = cma_save_ip_info(listen_addr, src_addr, ib_event, + req->service_id); + if (err) + return ERR_PTR(err); + + net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, + gid, listen_addr); + if (!net_dev) + return ERR_PTR(-ENODEV); + + if (!validate_net_dev(net_dev, listen_addr, src_addr)) { + dev_put(net_dev); + return ERR_PTR(-EHOSTUNREACH); + } + + return net_dev; +} + +static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +{ + return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, + const struct cma_hdr *hdr) +{ + struct sockaddr *addr = cma_src_addr(id_priv); + __be32 ip4_addr; + struct in6_addr ip6_addr; + + if (cma_any_addr(addr) && !id_priv->afonly) + return true; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; + if (cma_get_ip_ver(hdr) != 4) + return false; + if (!cma_any_addr(addr) && + hdr->dst_addr.ip4.addr != ip4_addr) + return false; + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; + if (cma_get_ip_ver(hdr) != 6) + return false; + if (!cma_any_addr(addr) && + memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) + return false; + break; + case AF_IB: + return true; default: - return sizeof(struct cma_hdr); + return false; } + + return true; +} + +static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); + enum rdma_transport_type transport = + rdma_node_get_transport(device->node_type); + + return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; +} + +static bool cma_protocol_roce(const struct rdma_cm_id *id) +{ + struct ib_device *device = id->device; + const int port_num = id->port_num ?: rdma_start_port(device); + + return cma_protocol_roce_dev_port(device, port_num); +} + +static bool cma_match_net_dev(const struct rdma_cm_id *id, + const struct net_device *net_dev, + u8 port_num) +{ + const struct rdma_addr *addr = &id->route.addr; + + if (!net_dev) + /* This request is an AF_IB request or a RoCE request */ + return (!id->port_num || id->port_num == port_num) && + (addr->src_addr.ss_family == AF_IB || + cma_protocol_roce_dev_port(id->device, port_num)); + + return !addr->dev_addr.bound_dev_if || + (net_eq(dev_net(net_dev), addr->dev_addr.net) && + addr->dev_addr.bound_dev_if == net_dev->if_index); +} + +static struct rdma_id_private *cma_find_listener( + const struct rdma_bind_list *bind_list, + const struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + const struct cma_req_info *req, + const struct net_device *net_dev) +{ + struct rdma_id_private *id_priv, *id_priv_dev; + + if (!bind_list) + return ERR_PTR(-EINVAL); + + hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req->port)) + return id_priv; + list_for_each_entry(id_priv_dev, + &id_priv->listen_list, + listen_list) { + if (id_priv_dev->id.device == cm_id->device && + cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + return id_priv_dev; + } + } + } + + return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event, + struct net_device **net_dev) +{ + struct cma_req_info req; + struct rdma_bind_list *bind_list; + struct 
rdma_id_private *id_priv; + int err; + + err = cma_save_req_info(ib_event, &req); + if (err) + return ERR_PTR(err); + + *net_dev = cma_get_net_dev(ib_event, &req); + if (IS_ERR(*net_dev)) { + if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { + /* Assuming the protocol is AF_IB */ + *net_dev = NULL; + } else if (cma_protocol_roce_dev_port(req.device, req.port)) { + /* TODO find the net dev matching the request parameters + * through the RoCE GID table */ + *net_dev = NULL; + } else { + return ERR_CAST(*net_dev); + } + } + + bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, + rdma_ps_from_service_id(req.service_id), + cma_port_from_service_id(req.service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + if (IS_ERR(id_priv) && *net_dev) { + dev_put(*net_dev); + *net_dev = NULL; + } + + return id_priv; +} + +static inline int cma_user_data_offset(struct rdma_id_private *id_priv) +{ + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); - break; - default: - break; } } @@ -1046,8 +1550,7 @@ cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) - && !id_priv->cma_dev) + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: @@ -1057,18 +1560,16 @@ static void cma_release_port(struct rdma_id_private *id_priv) { - struct rdma_bind_list *bind_list; + struct rdma_bind_list *bind_list = id_priv->bind_list; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; - mutex_lock(&lock); - bind_list = id_priv->bind_list; - if (!bind_list) { - mutex_unlock(&lock); + if (!bind_list) return; - } + + mutex_lock(&lock); hlist_del(&id_priv->node); - id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { - idr_remove(bind_list->ps, bind_list->port); + cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); @@ -1082,39 +1583,32 @@ mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { - case IB_LINK_LAYER_INFINIBAND: + if (rdma_cap_ib_mcast(id_priv->cma_dev->device, + id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - break; - case IB_LINK_LAYER_ETHERNET: + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); - break; - default: - break; } } } -static void __rdma_free(struct work_struct *work) -{ - struct rdma_id_private *id_priv; - id_priv = container_of(work, struct rdma_id_private, work); - - wait_for_completion(&id_priv->comp); - - if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); - - kfree(id_priv->id.route.path_rec); - kfree(id_priv); -} void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; - unsigned long flags; - struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); state = 
cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -1128,24 +1622,12 @@ mutex_unlock(&id_priv->handler_mutex); if (id_priv->cma_dev) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: - spin_lock_irqsave(&id_priv->cm_lock, flags); - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { - ib = id_priv->cm_id.ib; - id_priv->cm_id.ib = NULL; - spin_unlock_irqrestore(&id_priv->cm_lock, flags); - ib_destroy_cm_id(ib); - } else - spin_unlock_irqrestore(&id_priv->cm_lock, flags); - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); @@ -1153,8 +1635,13 @@ cma_release_port(id_priv); cma_deref_id(id_priv); - INIT_WORK(&id_priv->work, __rdma_free); - queue_work(cma_free_wq, &id_priv->work); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); @@ -1170,7 +1657,6 @@ if (ret) goto reject; - cma_dbg(id_priv, "sending RTU\n"); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; @@ -1178,22 +1664,11 @@ return 0; reject: cma_modify_qp_err(id_priv); - cma_dbg(id_priv, "sending REJ\n"); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } -static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) -{ - if (id_priv->id.ps == RDMA_PS_SDP && - sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; - - return 0; -} - static void cma_set_rep_event_data(struct rdma_cm_event *event, struct ib_cm_rep_event_param *rep_data, void *private_data) @@ -1214,11 +1689,13 @@ struct rdma_cm_event event; int ret = 0; + mutex_lock(&id_priv->handler_mutex); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + id_priv->state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) - return 0; + id_priv->state != RDMA_CM_DISCONNECT)) + goto out; + memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: @@ -1227,15 +1704,13 @@ event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - event.status = cma_verify_rep(id_priv, ib_event->private_data); - if (event.status) - event.event = RDMA_CM_EVENT_CONNECT_ERROR; - else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { + if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; - } else + } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; @@ -1266,7 +1741,7 @@ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -1286,27 +1761,28 @@ } static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; - int ret; - - if (cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) - return NULL; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + const __be64 service_id = + ib_event->param.req_rcvd.primary_path->service_id; + int ret; - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type); if (IS_ERR(id)) return NULL; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, service_id)) + goto err; rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; @@ -1319,19 +1795,24 @@ if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { - rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; - rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); - ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); - } else { - ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, - &rt->addr.dev_addr, NULL); + if (net_dev) { + ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_protocol_roce(listen_id) && + cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; @@ -1341,36 +1822,40 @@ } static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event) + struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *id_priv; struct rdma_cm_id *id; - union cma_ip_addr *src, *dst; - __be16 port; - u8 ip_ver; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct vnet *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(listen_id->event_handler, listen_id->context, + id = rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD); if (IS_ERR(id)) return 
NULL; - - if (cma_get_net_info(ib_event->private_data, listen_id->ps, - &ip_ver, &port, &src, &dst)) + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, + ib_event->param.sidr_req_rcvd.service_id)) goto err; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, - ip_ver, port, src, dst); - - if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr, NULL); + if (net_dev) { + ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); if (ret) goto err; + } else { + if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), + &id->route.addr.dev_addr); + if (ret) + goto err; + } } - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: @@ -1382,7 +1867,7 @@ struct ib_cm_req_event_param *req_data, void *private_data, int offset) { - event->param.conn.private_data = private_data + offset; + event->param.conn.private_data = (char *)private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; @@ -1404,38 +1889,36 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { - struct rdma_id_private *listen_id, *conn_id; + struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event; + struct net_device *net_dev; int offset, ret; - u8 smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u8 *psmac = smac; - u8 *palt_smac = alt_smac; - int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == - RDMA_TRANSPORT_IB) && - (rdma_port_get_link_layer(cm_id->device, - ib_event->param.req_rcvd.port) == - IB_LINK_LAYER_ETHERNET)); - int is_sidr = 0; - listen_id = cm_id->context; - if (!cma_check_req_qp_type(&listen_id->id, ib_event)) - return -EINVAL; + listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); + + if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { + ret = -EINVAL; + goto net_dev_put; + } - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) { + ret = -ECONNABORTED; + goto err1; + } memset(&event, 0, sizeof event); - offset = cma_user_data_offset(listen_id->id.ps); + offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - is_sidr = 1; - conn_id = cma_new_udp_id(&listen_id->id, ib_event); - event.param.ud.private_data = ib_event->private_data + offset; + conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); + event.param.ud.private_data = (char *)ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { - conn_id = cma_new_conn_id(&listen_id->id, ib_event); + conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } @@ -1461,39 +1944,20 @@ ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; - - if (is_iboe && !is_sidr) { - u32 scope_id = rdma_get_ipv6_scope_id(cm_id->device, - 
ib_event->param.req_rcvd.port); - - if (ib_event->param.req_rcvd.primary_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.primary_path->sgid, - psmac, NULL, scope_id); - else - psmac = NULL; - if (ib_event->param.req_rcvd.alternate_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.alternate_path->sgid, - palt_smac, NULL, scope_id); - else - palt_smac = NULL; - } - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (is_iboe && !is_sidr) - ib_update_cm_av(cm_id, psmac, palt_smac); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { - cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); - } - mutex_unlock(&lock); - mutex_unlock(&conn_id->handler_mutex); + /* + * Acquire mutex to prevent user executing rdma_destroy_id() + * while we're accessing the cm_id. + */ + mutex_lock(&lock); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + mutex_unlock(&lock); + mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); + if (net_dev) + dev_put(net_dev); return 0; err3: @@ -1507,81 +1971,34 @@ mutex_unlock(&listen_id->handler_mutex); if (conn_id) rdma_destroy_id(&conn_id->id); - return ret; -} -static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) -{ - return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); +net_dev_put: + if (net_dev) + dev_put(net_dev); + + return ret; } -static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, - struct ib_cm_compare_data *compare) +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { - struct cma_hdr *cma_data, *cma_mask; - struct sdp_hh *sdp_data, *sdp_mask; - __be32 ip4_addr; - struct in6_addr ip6_addr; - - memset(compare, 0, sizeof *compare); - cma_data = (void *) compare->data; - cma_mask = (void *) compare->mask; - sdp_data = (void *) compare->data; - sdp_mask = (void *) compare->mask; + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; - switch (addr->sa_family) { - case AF_INET: - ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 4); - sdp_set_ip_ver(sdp_mask, 0xF); - if (!cma_any_addr(addr)) { - sdp_data->dst_addr.ip4.addr = ip4_addr; - sdp_mask->dst_addr.ip4.addr = htonl(~0); - } - } else { - cma_set_ip_ver(cma_data, 4); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip4.addr = ip4_addr; - cma_mask->dst_addr.ip4.addr = htonl(~0); - } - } - break; - case AF_INET6: - ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; - if (ps == RDMA_PS_SDP) { - sdp_set_ip_ver(sdp_data, 6); - sdp_set_ip_ver(sdp_mask, 0xF); - if (!cma_any_addr(addr)) { - sdp_data->dst_addr.ip6 = ip6_addr; - memset(&sdp_mask->dst_addr.ip6, 0xFF, - sizeof(sdp_mask->dst_addr.ip6)); - } - } else { - cma_set_ip_ver(cma_data, 6); - cma_set_ip_ver(cma_mask, 0xF); - if (!cma_any_addr(addr)) { - cma_data->dst_addr.ip6 = ip6_addr; - memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof(cma_mask->dst_addr.ip6)); - } - } - break; - default: - break; - } + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } +EXPORT_SYMBOL(rdma_get_service_id); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event 
*iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; - struct sockaddr_in *sin; int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (iw_event->event) { @@ -1589,11 +2006,11 @@ event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; - *sin = iw_event->remote_addr; - switch ((int)iw_event->status) { + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; @@ -1633,6 +2050,7 @@ return ret; } +out: mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -1642,18 +2060,20 @@ { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct sockaddr_in *sin; - struct net_device *dev = NULL; struct rdma_cm_event event; - int ret; - struct ib_device_attr attr; + int ret = -ECONNABORTED; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) + goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.event_handler, + new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(new_cm_id)) { @@ -1664,14 +2084,7 @@ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); - if (!dev) { - ret = -EADDRNOTAVAIL; - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1689,17 +2102,8 @@ cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; - *sin = iw_event->remote_addr; - - ret = ib_query_device(conn_id->id.device, &attr); - if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; @@ -1728,60 +2132,42 @@ cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { - struct ib_cm_compare_data compare_data; struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; - int ret; - id = 
ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); - id_priv->cm_id.ib = id; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - svc_id = cma_get_service_id(id_priv->id.ps, addr); - if (cma_any_addr(addr) && !id_priv->afonly) - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); - else { - cma_set_compare_data(id_priv->id.ps, addr, &compare_data); - ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); - } - - if (ret) { - ib_destroy_cm_id(id_priv->cm_id.ib); - id_priv->cm_id.ib = NULL; - } - - return ret; + return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; - struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, - id_priv->sock, - iw_conn_req_handler, - id_priv); + iw_conn_req_handler, + id_priv); if (IS_ERR(id)) return PTR_ERR(id); + id->tos = id_priv->tos; id_priv->cm_id.iw = id; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - id_priv->cm_id.iw->local_addr = *sin; + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); @@ -1808,9 +2194,13 @@ { struct rdma_id_private *dev_id_priv; struct rdma_cm_id *id; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; int ret; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return; + + id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) return; @@ -1818,11 +2208,10 @@ dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; - dev_id_priv->sock = id_priv->sock; - memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; @@ -1830,7 +2219,8 @@ ret = rdma_listen(id, id_priv->backlog); if (ret) - cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); + pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", + ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -1853,15 +2243,6 @@ } EXPORT_SYMBOL(rdma_set_service_type); -void rdma_set_timeout(struct rdma_cm_id *id, int timeout) -{ - struct rdma_id_private *id_priv; - - id_priv = container_of(id, struct rdma_id_private, id); - id_priv->qp_timeout = (u8) timeout; -} -EXPORT_SYMBOL(rdma_set_timeout); - static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { @@ -1886,31 +2267,39 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { - struct rdma_addr *addr = &id_priv->id.route.addr; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); - 
rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &addr->dst_addr); + path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; - if (addr->src_addr.ss_family == AF_INET) { + switch (cma_family(id_priv)) { + case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; - } else { - sin6 = (struct sockaddr_in6 *) &addr->src_addr; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, @@ -1944,30 +2333,6 @@ kfree(work); } -static void cma_ndev_work_handler(struct work_struct *_work) -{ - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); - struct rdma_id_private *id_priv = work->id; - int destroy = 0; - - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out; - - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; - } - -out: - mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); - kfree(work); -} - static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; @@ -2046,9 +2411,22 @@ return 0; } -static u8 tos_to_sl(u8 tos) +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + /* TODO: Implement this function */ + return 0; +} + +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) { - return def_prec2sl & 7; + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) @@ -2057,14 +2435,9 @@ struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; - struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - if (src_addr->sin_family != dst_addr->sin_family) - return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -2080,29 +2453,64 @@ route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (addr->dev_addr.bound_dev_if) { + unsigned long supported_gids; + + ndev = dev_get_by_index(addr->dev_addr.net, + 
addr->dev_addr.bound_dev_if); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + if (ndev->if_flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) { + ret = -EOPNOTSUPP; + goto err2; + } + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + } + + route->path_rec->net = ndev->if_vnet; + route->path_rec->ifindex = ndev->if_index; + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + route->path_rec->gid_type = + cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + } if (!ndev) { ret = -ENODEV; goto err2; } - route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); - memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = tos_to_sl(id_priv->tos); - + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); @@ -2141,27 +2549,15 @@ return -EINVAL; atomic_inc(&id_priv->refcount); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_resolve_ib_route(id_priv, timeout_ms); - break; - case IB_LINK_LAYER_ETHERNET: - ret = cma_resolve_iboe_route(id_priv); - break; - default: - ret = -ENOSYS; - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) + ret = cma_resolve_iboe_route(id_priv); + else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv, timeout_ms); - break; - default: + else ret = -ENOSYS; - break; - } + if (ret) goto err; @@ -2173,38 +2569,60 @@ } EXPORT_SYMBOL(rdma_resolve_route); -int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) +static void cma_set_loopback(struct sockaddr *addr) { - /* APM is not supported yet */ - return -EINVAL; + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } } -EXPORT_SYMBOL(rdma_enable_apm); static int cma_bind_loopback(struct rdma_id_private *id_priv) { - struct cma_device *cma_dev; + struct cma_device *cma_dev, *cur_dev; struct 
ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; + cma_dev = NULL; mutex_lock(&lock); - if (list_empty(&dev_list)) { + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + !rdma_cap_ib_cm(cur_dev->device, 1)) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!ib_query_port(cur_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { ret = -ENODEV; goto out; } - list_for_each_entry(cma_dev, &dev_list, list) - for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) - if (!ib_query_port(cma_dev->device, p, &port_attr) && - port_attr.state == IB_PORT_ACTIVE) - goto port_found; p = 1; - cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); if (ret) goto out; @@ -2213,13 +2631,14 @@ goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? + (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; @@ -2237,8 +2656,7 @@ RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(&id_priv->id.route.addr.src_addr, src_addr, - ip_addr_size(src_addr)); + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv, NULL); @@ -2266,7 +2684,6 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr *src, *dst; union ib_gid gid; int ret; @@ -2283,18 +2700,6 @@ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_zero_addr(src)) { - dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; - if ((src->sa_family = dst->sa_family) == AF_INET) { - ((struct sockaddr_in *)src)->sin_addr = - ((struct sockaddr_in *)dst)->sin_addr; - } else { - ((struct sockaddr_in6 *)src)->sin6_addr = - ((struct sockaddr_in6 *)dst)->sin6_addr; - } - } - work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; @@ -2307,15 +2712,23 @@ return ret; } -static int cma_resolve_scif(struct rdma_id_private *id_priv) +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; + int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; - /* we probably can leave it empty here */ + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); @@ -2324,6 +2737,9 @@ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; +err: + kfree(work); + return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2332,48 +2748,18 @@ if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; 
src_addr->sa_family = dst_addr->sa_family; -#ifdef INET6 if (dst_addr->sa_family == AF_INET6) { - ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = - ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; - } -#endif - } - if (!cma_any_addr(src_addr)) - return rdma_bind_addr(id, src_addr); - else { -#if defined(INET6) || defined(INET) - union { -#ifdef INET - struct sockaddr_in in; -#endif -#ifdef INET6 - struct sockaddr_in6 in6; -#endif - } addr; -#endif - - switch(dst_addr->sa_family) { -#ifdef INET - case AF_INET: - memset(&addr.in, 0, sizeof(addr.in)); - addr.in.sin_family = dst_addr->sa_family; - addr.in.sin_len = sizeof(addr.in); - return rdma_bind_addr(id, (struct sockaddr *)&addr.in); -#endif -#ifdef INET6 - case AF_INET6: - memset(&addr.in6, 0, sizeof(addr.in6)); - addr.in6.sin6_family = dst_addr->sa_family; - addr.in6.sin6_len = sizeof(addr.in6); - addr.in6.sin6_scope_id = - ((struct sockaddr_in6 *)dst_addr)->sin6_scope_id; - return rdma_bind_addr(id, (struct sockaddr *)&addr.in6); -#endif - default: - return -EINVAL; + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (IN6_IS_SCOPE_LINKLOCAL(&dst_addr6->sin6_addr)) + id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; } } + return rdma_bind_addr(id, src_addr); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2389,20 +2775,25 @@ return ret; } + if (cma_family(id_priv) != dst_addr->sa_family) + return -EINVAL; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); - if (cma_any_addr(dst_addr)) + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); - else if (id_priv->id.device && - rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) - ret = cma_resolve_scif(id_priv); - else - ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } if (ret) goto err; @@ -2422,7 +2813,7 @@ id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == RDMA_CM_IDLE) { + if (reuse || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -2456,59 +2847,73 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { - struct sockaddr_in *sin; + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - sin->sin_port = htons(bind_list->port); + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = 
be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } -static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, - unsigned short snum) +static int cma_alloc_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; - int port, ret; + int ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; - do { - ret = idr_get_new_above(ps, bind_list, snum, &port); - } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); - - if (ret) - goto err1; - - if (port != snum) { - ret = -EADDRNOTAVAIL; - goto err2; - } + ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, + snum); + if (ret < 0) + goto err; bind_list->ps = ps; - bind_list->port = (unsigned short) port; + bind_list->port = (unsigned short)ret; cma_bind_port(bind_list, id_priv); return 0; -err2: - idr_remove(ps, port); -err1: +err: kfree(bind_list); - return ret; + return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } -static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_alloc_any_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; + struct vnet *net = id_priv->id.route.addr.dev_addr.net; + u32 rand; - inet_get_local_port_range(&init_net, &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = random() % remaining + low; + get_random_bytes(&rand, sizeof(rand)); + rover = rand % remaining + low; retry: if (last_used_port != rover && - !idr_find(ps, (unsigned short) rover)) { + !cma_ps_find(net, ps, (unsigned short)rover)) { int ret = cma_alloc_port(ps, id_priv, rover); /* * Remember previously used port number in order to avoid @@ -2518,7 +2923,7 @@ last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; - } + } if (--remaining) { rover++; if ((rover < low) || (rover > high)) @@ -2540,7 +2945,7 @@ struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; @@ -2549,7 +2954,7 @@ cur_id->reuseaddr) continue; - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; @@ -2563,15 +2968,19 @@ return 0; } -static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_use_port(enum rdma_port_space ps, + struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; - snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + snum = ntohs(cma_port(cma_src_addr(id_priv))); + if (snum < IPPORT_RESERVED && + priv_check(curthread, PRIV_NETINET_BINDANY) != 0) + return -EACCES; - bind_list = idr_find(ps, snum); + bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { @@ -2594,97 +3003,92 @@ return ret; } -static int cma_get_tcp_port(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_inet_ps( + struct rdma_id_private 
*id_priv) { - int ret; - int size; - struct socket *sock; - - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret) - return ret; -#ifdef __linux__ - ret = sock->ops->bind(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); -#else - ret = -sobind(sock, - (struct sockaddr *)&id_priv->id.route.addr.src_addr, - curthread); -#endif - if (ret) { - sock_release(sock); - return ret; + switch (id_priv->id.ps) { + case RDMA_PS_TCP: + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + case RDMA_PS_IB: + return id_priv->id.ps; + default: + + return 0; } +} - size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); - ret = sock_getname(sock, - (struct sockaddr *) &id_priv->id.route.addr.src_addr, - &size, 0); - if (ret) { - sock_release(sock); - return ret; +static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + enum rdma_port_space ps = 0; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = RDMA_PS_IB; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = RDMA_PS_TCP; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = RDMA_PS_UDP; } - id_priv->sock = sock; - return 0; + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { - struct idr *ps; + enum rdma_port_space ps; int ret; - switch (id_priv->id.ps) { - case RDMA_PS_SDP: - ps = &sdp_ps; - break; - case RDMA_PS_TCP: - ps = &tcp_ps; - if (unify_tcp_port_space) { - ret = cma_get_tcp_port(id_priv); - if (ret) - goto out; - } - break; - case RDMA_PS_UDP: - ps = &udp_ps; - break; - case RDMA_PS_IPOIB: - ps = &ipoib_ps; - break; - case RDMA_PS_IB: - ps = &ib_ps; - break; - default: + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) return -EPROTONOSUPPORT; - } mutex_lock(&lock); - if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); -out: + return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { -#if defined(INET6) - struct sockaddr_in6 *sin6; +#ifdef INET6 + struct sockaddr_in6 sin6; if (addr->sa_family != AF_INET6) return 0; - sin6 = (struct sockaddr_in6 *) addr; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && - !sin6->sin6_scope_id) - return -EINVAL; + sin6 = *(struct sockaddr_in6 *)addr; + + if (!(IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr))) + return 0; - dev_addr->bound_dev_if = sin6->sin6_scope_id; + if (sa6_recoverscope(&sin6) || sin6.sin6_scope_id == 0) + return -EINVAL; + + dev_addr->bound_dev_if = sin6.sin6_scope_id; #endif return 0; } @@ -2696,8 +3100,8 @@ id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) 
{ - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; } @@ -2713,19 +3117,15 @@ id_priv->backlog = backlog; if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; - break; - default: + } else { ret = -ENOSYS; goto err; } @@ -2744,12 +3144,9 @@ { struct rdma_id_private *id_priv; int ret; -#if defined(INET6) - int ipv6only; - size_t var_size = sizeof(int); -#endif - if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); @@ -2760,9 +3157,9 @@ if (ret) goto err1; - memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; @@ -2774,10 +3171,12 @@ if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; -#if defined(INET6) - else if (addr->sa_family == AF_INET6) - id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", - &ipv6only, &var_size, NULL, 0, NULL, 0); +#ifdef INET6 + else if (addr->sa_family == AF_INET6) { + CURVNET_SET_QUIET(id_priv->id.route.addr.dev_addr.net); + id_priv->afonly = V_ip6_v6only; + CURVNET_RESTORE(); + } #endif } ret = cma_get_port(id_priv); @@ -2794,62 +3193,32 @@ } EXPORT_SYMBOL(rdma_bind_addr); -static int cma_format_hdr(void *hdr, enum rdma_port_space ps, - struct rdma_route *route) +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; - struct sdp_hh *sdp_hdr; - if (route->addr.src_addr.ss_family == AF_INET) { + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; - src4 = (struct sockaddr_in *) &route->addr.src_addr; - dst4 = (struct sockaddr_in *) &route->addr.dst_addr; - - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 4); - sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - sdp_hdr->port = src4->sin_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 4); - cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; - cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; - cma_hdr->port = src4->sin_port; - break; - } - } else { + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; - src6 = (struct sockaddr_in6 *) &route->addr.src_addr; - dst6 = (struct 
sockaddr_in6 *) &route->addr.dst_addr; - - switch (ps) { - case RDMA_PS_SDP: - sdp_hdr = hdr; - if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) - return -EINVAL; - sdp_set_ip_ver(sdp_hdr, 6); - sdp_hdr->src_addr.ip6 = src6->sin6_addr; - sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; - sdp_hdr->port = src6->sin6_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; - cma_set_ip_ver(cma_hdr, 6); - cma_hdr->src_addr.ip6 = src6->sin6_addr; - cma_hdr->dst_addr.ip6 = dst6->sin6_addr; - cma_hdr->port = src6->sin6_port; - break; - } + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; } return 0; } @@ -2862,8 +3231,9 @@ struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -2879,27 +3249,28 @@ event.status = ib_event->param.sidr_rep_rcvd.status; break; } - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; - event.status = -EINVAL; + event.status = ret; break; } - if (id_priv->qkey != rep->qkey) { - event.event = RDMA_CM_EVENT_UNREACHABLE; - event.status = -EINVAL; + ret = ib_init_ah_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); + if (ret) { + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = ret; break; } - ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -2922,27 +3293,34 @@ struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; - struct rdma_route *route; struct ib_cm_id *id; - int ret; + void *private_data; + int offset, ret; - req.private_data_len = sizeof(struct cma_hdr) + - conn_param->private_data_len; + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!req.private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy((void *) req.private_data + sizeof(struct cma_hdr), - conn_param->private_data, conn_param->private_data_len); + memcpy((char *)private_data + offset, conn_param->private_data, + conn_param->private_data_len); - route = &id_priv->id.route; - ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); - if (ret) - goto out; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); @@ -2952,20 
+3330,18 @@ } id_priv->cm_id.ib = id; - req.path = route->path_rec; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); - req.timeout_ms = 1 << (cma_response_timeout - 8); + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; - cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: - kfree(req.private_data); + kfree(private_data); return ret; } @@ -2979,17 +3355,21 @@ int offset, ret; memset(&req, 0, sizeof req); - offset = cma_user_data_offset(id_priv->id.ps); + offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy(private_data + offset, conn_param->private_data, + memcpy((char *)private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); @@ -3000,17 +3380,18 @@ id_priv->cm_id.ib = id; route = &id_priv->id.route; - ret = cma_format_hdr(private_data, id_priv->id.ps, route); - if (ret) - goto out; - req.private_data = private_data; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; @@ -3019,12 +3400,11 @@ req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); - req.remote_cm_response_timeout = cma_response_timeout; - req.local_cm_response_timeout = cma_response_timeout; + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 
1 : 0; - cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { @@ -3040,32 +3420,30 @@ struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; - struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; - cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, - cma_iw_handler, id_priv); + cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + cm_id->tos = id_priv->tos; id_priv->cm_id.iw = cm_id; - sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; - cm_id->local_addr = *sin; - - sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; - cm_id->remote_addr = *sin; + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { - iw_param.ord = conn_param->initiator_depth; - iw_param.ird = conn_param->responder_resources; - iw_param.private_data = conn_param->private_data; - iw_param.private_data_len = conn_param->private_data_len; + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); @@ -3094,21 +3472,15 @@ id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_connect_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto err; @@ -3144,7 +3516,7 @@ rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; - cma_dbg(id_priv, "sending REP\n"); + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; @@ -3156,9 +3528,6 @@ struct iw_cm_conn_param iw_param; int ret; - if (!conn_param) - return -EINVAL; - ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; @@ -3176,7 +3545,7 @@ } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, - enum ib_cm_sidr_status status, + enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; @@ -3185,7 +3554,7 @@ memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, qkey); if (ret) return ret; rep.qp_num = id_priv->qp_num; @@ -3194,7 +3563,6 @@ rep.private_data = private_data; rep.private_data_len = private_data_len; - cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } @@ -3205,7 +3573,8 @@ id_priv = container_of(id, struct rdma_id_private, id); - id_priv->owner = curthread->td_proc->p_pid; + id_priv->owner = task_pid_nr(current); + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -3214,31 +3583,26 @@ id_priv->srq = conn_param->srq; } - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - conn_param->private_data, - conn_param->private_data_len); + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - NULL, 0); + 0, NULL, 0); } else { if (conn_param) - ret = cma_accept_ib(id_priv, conn_param); - else - ret = cma_rep_recv(id_priv); + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) ret = cma_accept_iw(id_priv, conn_param); - break; - default: + else ret = -ENOSYS; - break; - } if (ret) goto reject; @@ -3282,27 +3646,20 @@ if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); - else { - cma_dbg(id_priv, "sending REJ\n"); + else ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - break; - default: + } else ret = -ENOSYS; - break; - } + return ret; } EXPORT_SYMBOL(rdma_reject); @@ -3316,26 +3673,18 @@ if (!id_priv->cm_id.ib) return -EINVAL; - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: + if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ - cma_dbg(id_priv, "sending DREQ\n"); - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { - cma_dbg(id_priv, "sending DREP\n"); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); - } - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); - break; - default: + } else ret = -EINVAL; - break; - } + out: return ret; } @@ -3346,17 +3695,16 @@ struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; - struct rdma_dev_addr *dev_addr; - int ret; - struct net_device *ndev = NULL; - u16 vlan; + int ret = 0; id_priv = mc->id_priv; - dev_addr = &id_priv->id.route.addr.dev_addr; - if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && - cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_ADDR_BOUND && + id_priv->state != RDMA_CM_ADDR_RESOLVED) + goto out; + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, @@ -3366,32 +3714,27 @@ memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!ndev) { - status = -ENODEV; - } else { - vlan = rdma_vlan_dev_vlan_id(ndev); - dev_put(ndev); - } if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); - event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - } else { + if (ndev) + dev_put(ndev); + } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - /* mark that the cached record is no longer valid */ - if (status != -ENETRESET && status != -EAGAIN) { - spin_lock(&id_priv->lock); - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); - } - } - ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -3400,6 +3743,7 @@ return 0; } +out: mutex_unlock(&id_priv->handler_mutex); return 0; } @@ -3410,24 +3754,22 @@ unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; -#if defined(INET6) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; -#endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); -#if defined(INET6) } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); -#endif } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) @@ -3436,39 +3778,99 @@ } } +static void cma_query_sa_classport_info_cb(int status, + struct ib_class_port_info *rec, + void *context) +{ + struct class_port_info_context *cb_ctx = context; + + WARN_ON(!context); + + if (status || !rec) { + pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", + cb_ctx->device->name, cb_ctx->port_num, status); + goto out; + } + + memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); + +out: + complete(&cb_ctx->done); +} + +static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, + struct ib_class_port_info *class_port_info) +{ + struct class_port_info_context *cb_ctx; + int ret; + + cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); + if (!cb_ctx) + return -ENOMEM; + + cb_ctx->device = device; + cb_ctx->class_port_info = class_port_info; + cb_ctx->port_num = port_num; + init_completion(&cb_ctx->done); + + ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, + CMA_QUERY_CLASSPORT_INFO_TIMEOUT, + GFP_KERNEL, cma_query_sa_classport_info_cb, + cb_ctx, &cb_ctx->sa_query); + if (ret < 0) { + pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", + device->name, port_num, ret); + goto out; + } + + wait_for_completion(&cb_ctx->done); + +out: + kfree(cb_ctx); + return ret; +} + static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; + struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; - int ret = 0; + int ret; - ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; - /* cache ipoib bc record */ - spin_lock(&id_priv->lock); - if (!id_priv->is_valid_rec) - ret = ib_sa_get_mcmember_rec(id_priv->id.device, - id_priv->id.port_num, - &id_priv->rec.mgid, - &id_priv->rec); - if (ret) { - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); + ret = cma_set_qkey(id_priv, 0); + if (ret) return ret; - } else { - rec = id_priv->rec; - id_priv->is_valid_rec = 1; - } - spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); - if (id_priv->id.ps == RDMA_PS_UDP) - rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); - rec.join_state = 1; + rec.join_state = mc->join_state; + + if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { + ret = cma_query_sa_classport_info(id_priv->id.device, + id_priv->id.port_num, + &class_port_info); + + if (ret) + return ret; + + if (!(ib_get_cpi_capmask2(&class_port_info) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return 
-EOPNOTSUPP; + } + } comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | @@ -3487,7 +3889,7 @@ id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); - return PTR_RET(mc->multicast.ib); + return PTR_ERR_OR_ZERO(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) @@ -3533,9 +3935,13 @@ { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; + bool send_only; + + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3557,7 +3963,7 @@ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; @@ -3565,9 +3971,24 @@ mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!send_only) { + mc->igmp_joined = true; + } + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -3588,7 +4009,7 @@ } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context) + u8 join_state, void *context) { struct rdma_id_private *id_priv; struct cma_multicast *mc; @@ -3603,32 +4024,22 @@ if (!mc) return -ENOMEM; - memcpy(&mc->addr, addr, ip_addr_size(addr)); + memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; + mc->join_state = join_state; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ret = cma_join_ib_multicast(id_priv, mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_init(&mc->mcref); - ret = cma_iboe_join_multicast(id_priv, mc); - break; - default: - ret = -EINVAL; - } - break; - default: + if (rdma_protocol_roce(id->device, id->port_num)) { + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + ret = cma_join_ib_multicast(id_priv, mc); + else ret = -ENOSYS; - break; - } if (ret) { spin_lock_irq(&id_priv->lock); @@ -3648,7 +4059,7 @@ id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); @@ -3656,18 +4067,27 @@ ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, be16_to_cpu(mc->multicast.ib->rec.mlid)); - if 
(rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { - switch (rdma_port_get_link_layer(id->device, id->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - break; - case IB_LINK_LAYER_ETHERNET: - kref_put(&mc->mcref, release_mc); - break; - default: - break; + + BUG_ON(id_priv->cma_dev->device != id->device); + + if (rdma_cap_ib_mcast(id->device, id->port_num)) { + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + dev_put(ndev); + } + mc->igmp_joined = false; } + kref_put(&mc->mcref, release_mc); } return; } @@ -3676,80 +4096,60 @@ } EXPORT_SYMBOL(rdma_leave_multicast); -static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +static int +sysctl_cma_default_roce_mode(SYSCTL_HANDLER_ARGS) { - struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; - - dev_addr = &id_priv->id.route.addr.dev_addr; - - if ((dev_addr->bound_dev_if == ndev->if_index) && - memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->if_xname, &id_priv->id); - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; - - INIT_WORK(&work->work, cma_ndev_work_handler); - work->id = id_priv; - work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); - queue_work(cma_wq, &work->work); + struct cma_device *cma_dev = arg1; + const int port = arg2; + char buf[64]; + int error; + + strlcpy(buf, ib_cache_gid_type_str( + cma_get_default_gid_type(cma_dev, port)), sizeof(buf)); + + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + goto done; + + error = ib_cache_gid_parse_type_str(buf); + if (error < 0) { + error = EINVAL; + goto done; } - return 0; -} - -static int cma_netdev_callback(struct notifier_block *self, unsigned long event, - void *ctx) -{ - struct net_device *ndev = (struct net_device *)ctx; - struct cma_device *cma_dev; - struct rdma_id_private *id_priv; - int ret = NOTIFY_DONE; - -/* BONDING related, commented out until the bonding is resolved */ -#if 0 - if (dev_net(ndev) != &init_net) - return NOTIFY_DONE; - - if (event != NETDEV_BONDING_FAILOVER) - return NOTIFY_DONE; - - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) - return NOTIFY_DONE; -#endif - if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) - return NOTIFY_DONE; - - mutex_lock(&lock); - list_for_each_entry(cma_dev, &dev_list, list) - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - ret = cma_netdev_change(ndev, id_priv); - if (ret) - goto out; - } - -out: - mutex_unlock(&lock); - return ret; + cma_set_default_gid_type(cma_dev, port, error); + error = 0; +done: + return (error); } -static struct notifier_block cma_nb = { - .notifier_call = cma_netdev_callback -}; - static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; + sysctl_ctx_init(&cma_dev->sysctl_ctx); + cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + 
sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3761,6 +4161,18 @@ list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + char buf[64]; + + snprintf(buf, sizeof(buf), "default_roce_mode_port%d", i); + + (void) SYSCTL_ADD_PROC(&cma_dev->sysctl_ctx, + SYSCTL_CHILDREN(device->ports_parent->parent->oidp), + OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + cma_dev, i, &sysctl_cma_default_roce_mode, "A", + "Default RoCE mode"); + } } static int cma_remove_id_dev(struct rdma_id_private *id_priv) @@ -3817,11 +4229,10 @@ wait_for_completion(&cma_dev->comp); } -static void cma_remove_one(struct ib_device *device) +static void cma_remove_one(struct ib_device *device, void *client_data) { - struct cma_device *cma_dev; + struct cma_device *cma_dev = client_data; - cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; @@ -3830,56 +4241,66 @@ mutex_unlock(&lock); cma_process_remove(cma_dev); + sysctl_ctx_free(&cma_dev->sysctl_ctx); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } +static void cma_init_vnet(void *arg) +{ + struct cma_pernet *pernet = &VNET(cma_pernet); + + idr_init(&pernet->tcp_ps); + idr_init(&pernet->udp_ps); + idr_init(&pernet->ipoib_ps); + idr_init(&pernet->ib_ps); +} +VNET_SYSINIT(cma_init_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_FIRST, cma_init_vnet, NULL); + +static void cma_destroy_vnet(void *arg) +{ + struct cma_pernet *pernet = &VNET(cma_pernet); + + idr_destroy(&pernet->tcp_ps); + idr_destroy(&pernet->udp_ps); + idr_destroy(&pernet->ipoib_ps); + idr_destroy(&pernet->ib_ps); +} +VNET_SYSUNINIT(cma_destroy_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_SECOND, cma_destroy_vnet, NULL); + static int __init cma_init(void) { - int ret = -ENOMEM; + int ret; - cma_wq = create_singlethread_workqueue("rdma_cm"); + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; - cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); - if (!cma_free_wq) - goto err1; - ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); - register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) goto err; + cma_configfs_init(); + return 0; err: - unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - - destroy_workqueue(cma_free_wq); -err1: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { + cma_configfs_exit(); ib_unregister_client(&cma_client); - unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - flush_workqueue(cma_free_wq); - destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); - idr_destroy(&sdp_ps); - idr_destroy(&tcp_ps); - idr_destroy(&udp_ps); - idr_destroy(&ipoib_ps); - idr_destroy(&ib_ps); } module_init(cma_init); Index: sys/ofed/drivers/infiniband/core/ib_cq.c =================================================================== --- /dev/null +++ 
sys/ofed/drivers/infiniband/core/ib_cq.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#define IB_CQ_POLL_MAX 16 +/* maximum number of completions per poll loop */ +#define IB_CQ_POLL_BUDGET 65536 +#define IB_CQ_POLL_FLAGS (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static void +ib_cq_poll_work(struct work_struct *work) +{ + struct ib_wc ib_wc[IB_CQ_POLL_MAX]; + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int total = 0; + int i; + int n; + + while (1) { + n = ib_poll_cq(cq, IB_CQ_POLL_MAX, ib_wc); + for (i = 0; i < n; i++) { + struct ib_wc *wc = ib_wc + i; + + if (wc->wr_cqe != NULL) + wc->wr_cqe->done(cq, wc); + } + + if (n != IB_CQ_POLL_MAX) { + if (ib_req_notify_cq(cq, IB_CQ_POLL_FLAGS) > 0) + break; + else + return; + } + total += n; + if (total >= IB_CQ_POLL_BUDGET) + break; + } + + /* give other work structs a chance */ + queue_work(ib_comp_wq, &cq->work); +} + +static void +ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +struct ib_cq * +ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + + /* + * Check for invalid parameters early on to avoid + * extra error handling code: + */ + switch (poll_ctx) { + case IB_POLL_DIRECT: + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + break; + default: + return (ERR_PTR(-EINVAL)); + } + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return (cq); + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + switch (poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = NULL; /* no hardware completions */ + break; + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + break; + } + return (cq); +} +EXPORT_SYMBOL(ib_alloc_cq); + 
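/*
 * Illustrative sketch, not part of the patch: one way a consumer could use
 * the ib_alloc_cq()/ib_free_cq() pair in this file together with the wr_cqe
 * dispatch performed by ib_cq_poll_work() above.  The my_rx_req, my_rx_done
 * and my_setup_cq names, the qp argument and the elided SGE setup are
 * hypothetical; only ib_alloc_cq(), ib_free_cq(), struct ib_cqe,
 * ib_post_recv() and IB_POLL_WORKQUEUE come from the real API.  Assumes the
 * usual <rdma/ib_verbs.h> environment of this file.
 */
struct my_rx_req {
	struct ib_cqe	cqe;		/* embedded so done() can recover the request */
	u64		dma_addr;	/* receive buffer; mapping elided */
};

static void
my_rx_done(struct ib_cq *cq, struct ib_wc *wc)
{
	/* ib_cq_poll_work() invokes wc->wr_cqe->done(cq, wc) for each completion */
	struct my_rx_req *req = container_of(wc->wr_cqe, struct my_rx_req, cqe);

	if (wc->status != IB_WC_SUCCESS)
		return;
	/* hand the buffer described by req->dma_addr to the consumer here */
}

static int
my_setup_cq(struct ib_device *dev, struct ib_qp *qp, struct my_rx_req *req)
{
	struct ib_recv_wr wr = {};
	struct ib_recv_wr *bad;
	struct ib_cq *cq;

	/* completions will be processed from ib_comp_wq via ib_cq_poll_work() */
	cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
	if (IS_ERR(cq))
		return (PTR_ERR(cq));

	req->cqe.done = my_rx_done;
	wr.wr_cqe = &req->cqe;	/* used instead of wr_id so the CQ layer can demux */
	/* sg_list/num_sge setup elided; a real consumer fills these in */
	if (ib_post_recv(qp, &wr, &bad) != 0) {
		ib_free_cq(cq);
		return (-EINVAL);
	}
	return (0);
}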
+void +ib_free_cq(struct ib_cq *cq) +{ + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt) != 0)) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + break; + } + + (void)cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_free_cq); Index: sys/ofed/drivers/infiniband/core/ib_device.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_device.c +++ sys/ofed/drivers/infiniband/core/ib_device.c @@ -37,6 +37,9 @@ #include #include #include +#include +#include +#include #include "core_priv.h" @@ -48,22 +51,35 @@ struct list_head list; struct ib_client *client; void * data; + /* The device or client is going down. Do not call client or device + * callbacks other than remove(). */ + bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +/* The device_list and client_list contain devices and clients after their + * registration has completed, and the devices and clients are removed + * during unregistration. */ static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* - * device_mutex protects access to both device_list and client_list. - * There's no real point to using multiple locks or something fancier - * like an rwsem: we always access both lists, and we're always - * modifying one list or the other list. In any case this is not a - * hot path so there's no point in trying to optimize. + * device_mutex and lists_rwsem protect access to both device_list and + * client_list. device_mutex protects writer access by device and client + * registration / de-registration. lists_rwsem protects reader access to + * these lists. Iterators of these lists must lock it for read, while updates + * to the lists must be done with a write lock. A special case is when the + * device_mutex is locked. In this case locking the lists for read access is + * not necessary as the device_mutex implies it. + * + * lists_rwsem also protects access to the client data list. */ static DEFINE_MUTEX(device_mutex); +static DECLARE_RWSEM(lists_rwsem); + static int ib_device_check_mandatory(struct ib_device *device) { @@ -90,14 +106,15 @@ IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), - IB_MANDATORY_FUNC(dereg_mr) + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - printk(KERN_WARNING "Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + if (!*(void **) ((char *) device + mandatory_table[i].offset)) { + pr_warn("Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); return -EINVAL; } } @@ -149,18 +166,20 @@ return 0; } -static int start_port(struct ib_device *device) +static void ib_device_release(struct device *device) { - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; -} + struct ib_device *dev = container_of(device, struct ib_device, dev); - -static int end_port(struct ib_device *device) -{ - return (device->node_type == RDMA_NODE_IB_SWITCH) ? 
- 0 : device->phys_port_cnt; + ib_cache_release_one(dev); + kfree(dev->port_immutable); + kfree(dev); } +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, +}; + /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate @@ -173,14 +192,28 @@ */ struct ib_device *ib_alloc_device(size_t size) { - struct ib_device *dev; + struct ib_device *device; + + if (WARN_ON(size < sizeof(struct ib_device))) + return NULL; - BUG_ON(size < sizeof (struct ib_device)); + device = kzalloc(size, GFP_KERNEL); + if (!device) + return NULL; - dev = kzalloc(size, GFP_KERNEL); - spin_lock_init(&dev->cmd_perf_lock); + device->dev.parent = &linux_root_device; + device->dev.class = &ib_class; + device_initialize(&device->dev); - return dev; + dev_set_drvdata(&device->dev, device); + + INIT_LIST_HEAD(&device->event_handler_list); + spin_lock_init(&device->event_handler_lock); + spin_lock_init(&device->client_data_lock); + INIT_LIST_HEAD(&device->client_data_list); + INIT_LIST_HEAD(&device->port_list); + + return device; } EXPORT_SYMBOL(ib_alloc_device); @@ -192,13 +225,8 @@ */ void ib_dealloc_device(struct ib_device *device) { - if (device->reg_state == IB_DEV_UNINITIALIZED) { - kfree(device); - return; - } - - BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); - + WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && + device->reg_state != IB_DEV_UNINITIALIZED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); @@ -210,59 +238,70 @@ context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { - printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", - device->name, client->name); + pr_warn("Couldn't allocate client context for %s/%s\n", + device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; + context->going_down = false; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); return 0; } -static int read_port_table_lengths(struct ib_device *device) +static int verify_immutable(const struct ib_device *dev, u8 port) { - struct ib_port_attr *tprops = NULL; - int num_ports, ret = -ENOMEM; - u8 port_index; - - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); - if (!tprops) - goto out; - - num_ports = end_port(device) - start_port(device) + 1; + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} - device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, - GFP_KERNEL); - device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, - GFP_KERNEL); - if (!device->pkey_tbl_len || !device->gid_tbl_len) - goto err; +static int read_port_immutable(struct ib_device *device) +{ + int ret; + u8 start_port = rdma_start_port(device); + u8 end_port = rdma_end_port(device); + u8 port; + + /** + * device->port_immutable is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_immutable is declared as a 1 based array with + * potential empty slots at the beginning. 
+ */ + device->port_immutable = kzalloc(sizeof(*device->port_immutable) + * (end_port + 1), + GFP_KERNEL); + if (!device->port_immutable) + return -ENOMEM; - for (port_index = 0; port_index < num_ports; ++port_index) { - ret = ib_query_port(device, port_index + start_port(device), - tprops); + for (port = start_port; port <= end_port; ++port) { + ret = device->get_port_immutable(device, port, + &device->port_immutable[port]); if (ret) - goto err; - device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; - device->gid_tbl_len[port_index] = tprops->gid_tbl_len; - } + return ret; - ret = 0; - goto out; + if (verify_immutable(device, port)) + return -EINVAL; + } + return 0; +} -err: - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); -out: - kfree(tprops); - return ret; +void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +{ + if (dev->get_dev_fw_str) + dev->get_dev_fw_str(dev, str, str_len); + else + str[0] = '\0'; } +EXPORT_SYMBOL(ib_get_device_fw_str); /** * ib_register_device - Register an IB device with IB core @@ -278,6 +317,8 @@ u8, struct kobject *)) { int ret; + struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); @@ -292,40 +333,45 @@ goto out; } - INIT_LIST_HEAD(&device->event_handler_list); - INIT_LIST_HEAD(&device->client_data_list); - spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + ret = read_port_immutable(device); + if (ret) { + pr_warn("Couldn't create per port immutable data %s\n", + device->name); + goto out; + } - ret = read_port_table_lengths(device); + ret = ib_cache_setup_one(device); if (ret) { - printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", - device->name); + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); goto out; } - ret = ib_device_register_sysfs(device, port_callback); + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - printk(KERN_WARNING "Couldn't register device %s with driver model\n", - device->name); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + pr_warn("Couldn't query the device attributes\n"); + ib_cache_cleanup_one(device); goto out; } - list_add_tail(&device->core_list, &device_list); + ret = ib_device_register_sysfs(device, port_callback); + if (ret) { + pr_warn("Couldn't register device %s with driver model\n", + device->name); + ib_cache_cleanup_one(device); + goto out; + } device->reg_state = IB_DEV_REGISTERED; - { - struct ib_client *client; - - list_for_each_entry(client, &client_list, list) - if (client->add && !add_client_context(device, client)) - client->add(device); - } + list_for_each_entry(client, &client_list, list) + if (client->add && !add_client_context(device, client)) + client->add(device); - out: + down_write(&lists_rwsem); + list_add_tail(&device->core_list, &device_list); + up_write(&lists_rwsem); +out: mutex_unlock(&device_mutex); return ret; } @@ -339,29 +385,37 @@ */ void ib_unregister_device(struct ib_device *device) { - struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); - list_for_each_entry_reverse(client, &client_list, list) - if (client->remove) - client->remove(device); - + down_write(&lists_rwsem); list_del(&device->core_list); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + context->going_down = true; + 
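For orientation alongside this hunk: with get_port_immutable added to the mandatory table and read_port_immutable() filling device->port_immutable[] at registration time, every provider now has to supply that callback. A minimal sketch of what such a callback tends to look like, assuming an IB-style port and reusing ib_query_port() for the table lengths; the mydrv_* name is hypothetical, and RDMA_CORE_PORT_IBA_IB / IB_MGMT_MAD_SIZE are assumed identifiers, not taken from this diff:

static int mydrv_get_port_immutable(struct ib_device *ibdev, u8 port_num,
				    struct ib_port_immutable *immutable)
{
	struct ib_port_attr attr;
	int err;

	/* Same query the old read_port_table_lengths() used to issue. */
	err = ib_query_port(ibdev, port_num, &attr);
	if (err)
		return err;

	immutable->pkey_tbl_len = attr.pkey_tbl_len;
	immutable->gid_tbl_len = attr.gid_tbl_len;
	/* Assumed names: advertise an IB port that carries MADs, so the
	 * verify_immutable() WARN_ON above stays quiet. */
	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
	return 0;
}

The core then answers the rdma_cap_*() helpers and the GID/P_Key table walks further down purely from this cached data, which is what lets the old per-device gid_tbl_len/pkey_tbl_len arrays go away.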
spin_unlock_irqrestore(&device->client_data_lock, flags); + downgrade_write(&lists_rwsem); - kfree(device->gid_tbl_len); - kfree(device->pkey_tbl_len); + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + if (context->client->remove) + context->client->remove(device, context->data); + } + up_read(&lists_rwsem); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); + ib_cache_cleanup_one(device); + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; } @@ -386,11 +440,14 @@ mutex_lock(&device_mutex); - list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); + down_write(&lists_rwsem); + list_add_tail(&client->list, &client_list); + up_write(&lists_rwsem); + mutex_unlock(&device_mutex); return 0; @@ -413,19 +470,41 @@ mutex_lock(&device_mutex); + down_write(&lists_rwsem); + list_del(&client->list); + up_write(&lists_rwsem); + list_for_each_entry(device, &device_list, core_list) { - if (client->remove) - client->remove(device); + struct ib_client_data *found_context = NULL; + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { - list_del(&context->list); - kfree(context); + context->going_down = true; + found_context = context; + break; } spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + if (client->remove) + client->remove(device, found_context ? + found_context->data : NULL); + + if (!found_context) { + pr_warn("No client context found for %s/%s\n", + device->name, client->name); + continue; + } + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_del(&found_context->list); + kfree(found_context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); } - list_del(&client->list); mutex_unlock(&device_mutex); } @@ -479,8 +558,8 @@ goto out; } - printk(KERN_WARNING "No client context found for %s/%s\n", - device->name, client->name); + pr_warn("No client context found for %s/%s\n", + device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); @@ -551,21 +630,6 @@ EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. 
- */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - return device->query_device(device, device_attr); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query @@ -578,10 +642,26 @@ u8 port_num, struct ib_port_attr *port_attr) { - if (port_num < start_port(device) || port_num > end_port(device)) + union ib_gid gid; + int err; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return device->query_port(device, port_num, port_attr); + memset(port_attr, 0, sizeof(*port_attr)); + err = device->query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + return 0; + + err = ib_query_gid(device, port_num, 0, &gid, NULL); + if (err) + return err; + + port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); + return 0; } EXPORT_SYMBOL(ib_query_port); @@ -591,17 +671,91 @@ * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID + * @attr: Returned GID attributes related to this GID index (only in RoCE). + * NULL means ignore. * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid) + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr) { + if (rdma_cap_roce_gid_table(device, port_num)) + return ib_get_cached_gid(device, port_num, index, gid, attr); + + if (attr) + return -EINVAL; + return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. + */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + u8 port; + + for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); + port++) + if (rdma_protocol_roce(ib_dev, port)) { + struct net_device *idev = NULL; + + if (ib_dev->get_netdev) + idev = ib_dev->get_netdev(ib_dev, port); + + if (idev && (idev->if_flags & IFF_DYING)) { + dev_put(idev); + idev = NULL; + } + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. 
+ */ +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + struct ib_device *dev; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) + ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); + up_read(&lists_rwsem); +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -656,7 +810,7 @@ if (!device->modify_port) return -ENOSYS; - if (port_num < start_port(device) || port_num > end_port(device)) + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, @@ -669,19 +823,33 @@ * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. + * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; - for (port = start_port(device); port <= end_port(device); ++port) { - for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { - ret = ib_query_gid(device, port, i, &tmp_gid); + for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + if (rdma_cap_roce_gid_table(device, port)) { + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, index)) { + *port_num = port; + return 0; + } + } + + if (gid_type != IB_GID_TYPE_IB) + continue; + + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { @@ -712,7 +880,7 @@ u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; @@ -736,31 +904,103 @@ } EXPORT_SYMBOL(ib_find_pkey); +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev: An RDMA device on which the request has been received. + * @port: Port number on the RDMA device. + * @pkey: The Pkey the request came on. + * @gid: A GID that the net_dev uses to communicate. + * @addr: Contains the IP address that the request specified as its + * destination. 
+ */ +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr) +{ + struct net_device *net_dev = NULL; + struct ib_client_data *context; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + down_read(&lists_rwsem); + + list_for_each_entry(context, &dev->client_data_list, list) { + struct ib_client *client = context->client; + + if (context->going_down) + continue; + + if (client->get_net_dev_by_params) { + net_dev = client->get_net_dev_by_params(dev, port, pkey, + gid, addr, + context->data); + if (net_dev) + break; + } + } + + up_read(&lists_rwsem); + + return net_dev; +} +EXPORT_SYMBOL(ib_get_net_dev_by_params); + static int __init ib_core_init(void) { int ret; - ib_wq = create_workqueue("infiniband"); + ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; - ret = ib_sysfs_setup(); - if (ret) { - printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + mp_ncpus * 4 /* WQ_UNBOUND_MAX_ACTIVE */); + if (!ib_comp_wq) { + ret = -ENOMEM; goto err; } - ret = ib_cache_setup(); + ret = class_register(&ib_class); + if (ret) { + pr_warn("Couldn't create InfiniBand device class\n"); + goto err_comp; + } + + ret = addr_init(); if (ret) { - printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + pr_warn("Could't init IB address resolution\n"); goto err_sysfs; } + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + ib_cache_setup(); + return 0; +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); err_sysfs: - ib_sysfs_cleanup(); - + class_unregister(&ib_class); +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; @@ -769,7 +1009,11 @@ static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); - ib_sysfs_cleanup(); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); + class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); } @@ -777,17 +1021,5 @@ module_init(ib_core_init); module_exit(ib_core_cleanup); -static int -ibcore_evhand(module_t mod, int event, void *arg) -{ - return (0); -} - -static moduledata_t ibcore_mod = { - .name = "ibcore", - .evhand = ibcore_evhand, -}; - MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1); -DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_LAST, SI_ORDER_ANY); Index: sys/ofed/drivers/infiniband/core/ib_fmr_pool.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_fmr_pool.c +++ sys/ofed/drivers/infiniband/core/ib_fmr_pool.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include @@ -150,8 +149,8 @@ #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n", - fmr, fmr->ref_count); + pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); } #endif } @@ -167,7 +166,7 @@ ret = ib_unmap_fmr(&fmr_list); if (ret) - printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); @@ -212,7 +211,6 @@ { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; @@ -223,39 +221,20 @@ device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - printk(KERN_INFO PFX "Device %s does not support FMRs\n", - device->name); + pr_info(PFX "Device %s does not support FMRs\n", device->name); return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; - - kfree(attr); + max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); + if (!pool) return ERR_PTR(-ENOMEM); - } pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; pool->flush_arg = params->flush_arg; @@ -267,7 +246,7 @@ kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, GFP_KERNEL); if (!pool->cache_bucket) { - printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); + pr_warn(PFX "Failed to allocate cache in pool\n"); ret = -ENOMEM; goto out_free_pool; } @@ -291,7 +270,7 @@ "ib_fmr(%s)", device->name); if (IS_ERR(pool->thread)) { - printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); + pr_warn(PFX "couldn't start cleanup thread\n"); ret = PTR_ERR(pool->thread); goto out_free_pool; } @@ -310,11 +289,8 @@ for (i = 0; i < params->pool_size; ++i) { fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) { - printk(KERN_WARNING PFX "failed to allocate fmr " - "struct for FMR %d\n", i); + if (!fmr) goto out_fail; - } fmr->pool = pool; fmr->remap_count = 0; @@ -323,8 +299,8 @@ fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { - printk(KERN_WARNING PFX "fmr_create failed " - "for FMR %d\n", i); + pr_warn(PFX "fmr_create failed for FMR %d\n", + i); kfree(fmr); goto out_fail; } @@ -379,8 +355,8 @@ } if (i < pool->pool_size) - printk(KERN_WARNING 
PFX "pool still has %d regions registered\n", - pool->pool_size - i); + pr_warn(PFX "pool still has %d regions registered\n", + pool->pool_size - i); kfree(pool->cache_bucket); kfree(pool); @@ -479,7 +455,7 @@ list_add(&fmr->list, &pool->free_list); spin_unlock_irqrestore(&pool->pool_lock, flags); - printk(KERN_WARNING PFX "fmr_map returns %d\n", result); + pr_warn(PFX "fmr_map returns %d\n", result); return ERR_PTR(result); } @@ -533,8 +509,8 @@ #ifdef DEBUG if (fmr->ref_count < 0) - printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); + pr_warn(PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); #endif spin_unlock_irqrestore(&pool->pool_lock, flags); Index: sys/ofed/drivers/infiniband/core/ib_iwcm.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_iwcm.c +++ sys/ofed/drivers/infiniband/core/ib_iwcm.c @@ -5,7 +5,6 @@ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -36,8 +35,6 @@ * SOFTWARE. * */ -#include "opt_inet.h" - #include #include #include @@ -49,13 +46,10 @@ #include #include #include -#include -#include -#include -#include #include #include +#include #include "iwcm.h" @@ -71,84 +65,8 @@ struct iw_cm_event event; struct list_head free_list; }; -struct iwcm_listen_work { - struct work_struct work; - struct iw_cm_id *cm_id; -}; - -static LIST_HEAD(listen_port_list); - -static DEFINE_MUTEX(listen_port_mutex); - -struct listen_port_info { - struct list_head list; - uint16_t port_num; - uint32_t refcnt; -}; - -static int32_t -add_port_to_listenlist(uint16_t port) -{ - struct listen_port_info *port_info; - int err = 0; - - mutex_lock(&listen_port_mutex); - - list_for_each_entry(port_info, &listen_port_list, list) - if (port_info->port_num == port) - goto found_port; - - port_info = kmalloc(sizeof(*port_info), GFP_KERNEL); - if (!port_info) { - err = -ENOMEM; - mutex_unlock(&listen_port_mutex); - goto out; - } - - port_info->port_num = port; - port_info->refcnt = 0; - list_add(&port_info->list, &listen_port_list); - -found_port: - ++(port_info->refcnt); - mutex_unlock(&listen_port_mutex); - return port_info->refcnt; -out: - return err; -} - -static int32_t -rem_port_from_listenlist(uint16_t port) -{ - struct listen_port_info *port_info; - int ret, found_port = 0; - - mutex_lock(&listen_port_mutex); - - list_for_each_entry(port_info, &listen_port_list, list) - if (port_info->port_num == port) { - found_port = 1; - break; - } - - if (found_port) { - --(port_info->refcnt); - ret = port_info->refcnt; - if (port_info->refcnt == 0) { - /* Remove this entry from the list as there are no - * more listeners for this port_num. - */ - list_del(&port_info->list); - kfree(port_info); - } - } else { - ret = -EINVAL; - } - mutex_unlock(&listen_port_mutex); - return ret; - -} +static unsigned int default_backlog = 256; /* * The following services provide a mechanism for pre-allocating iwcm_work @@ -241,15 +159,14 @@ /* * Release a reference on cm_id. If the last reference is being - * released, enable the waiting thread (in iw_destroy_cm_id) to - * get woken up, and return 1 if a thread is already waiting. + * released, free the cm_id and return 1. 
*/ static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { BUG_ON(atomic_read(&cm_id_priv->refcount)==0); if (atomic_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); - complete(&cm_id_priv->destroy_comp); + free_cm_id(cm_id_priv); return 1; } @@ -266,25 +183,15 @@ static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; - int cb_destroy; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - /* - * Test bit before deref in case the cm_id gets freed on another - * thread. - */ - cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv) && cb_destroy) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + (void)iwcm_deref_id(cm_id_priv); } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, - struct socket *so, iw_cm_handler cm_handler, void *context) { @@ -301,7 +208,6 @@ cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; - cm_id_priv->id.so = so; spin_lock_init(&cm_id_priv->lock); atomic_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); @@ -411,154 +317,6 @@ } EXPORT_SYMBOL(iw_cm_disconnect); -static struct socket * -dequeue_socket(struct socket *head) -{ - struct socket *so; - struct sockaddr_in *remote; - - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return NULL; - } - - SOCK_LOCK(so); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - soref(so); - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - remote = NULL; - soaccept(so, (struct sockaddr **)&remote); - - free(remote, M_SONAME); - return so; -} -static void -iw_so_event_handler(struct work_struct *_work) -{ -#ifdef INET - struct iwcm_listen_work *work = container_of(_work, - struct iwcm_listen_work, work); - struct iw_cm_id *listen_cm_id = work->cm_id; - struct iwcm_id_private *cm_id_priv; - struct iw_cm_id *real_cm_id; - struct sockaddr_in *local; - struct socket *so; - - cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id); - - if (cm_id_priv->state != IW_CM_STATE_LISTEN) { - kfree(work); - return; - } - - /* Dequeue & process all new 'so' connection requests for this cmid */ - while ((so = dequeue_socket(work->cm_id->so)) != NULL) { - if (rdma_cma_any_addr((struct sockaddr *) - &listen_cm_id->local_addr)) { - in_getsockaddr(so, (struct sockaddr **)&local); - if (rdma_find_cmid_laddr(local, ARPHRD_ETHER, - (void **) &real_cm_id)) { - free(local, M_SONAME); - goto err; - } - free(local, M_SONAME); - - real_cm_id->device->iwcm->newconn(real_cm_id, so); - } else { - listen_cm_id->device->iwcm->newconn(listen_cm_id, so); - } - } -err: - kfree(work); -#endif - return; -} -static int -iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) -{ - struct iwcm_listen_work *work; - struct socket *so; - struct iw_cm_id *cm_id = arg; - - /* check whether iw_so_event_handler() already dequeued this 'so' */ - so = TAILQ_FIRST(&parent_so->so_comp); - if (!so) - return SU_OK; - work = kzalloc(sizeof(*work), M_NOWAIT); - if (!work) - return -ENOMEM; - work->cm_id = cm_id; - - 
INIT_WORK(&work->work, iw_so_event_handler); - queue_work(iwcm_wq, &work->work); - - return SU_OK; -} - -static void -iw_init_sock(struct iw_cm_id *cm_id) -{ - struct sockopt sopt; - struct socket *so = cm_id->so; - int on = 1; - - SOCK_LOCK(so); - soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = IPPROTO_TCP; - sopt.sopt_name = TCP_NODELAY; - sopt.sopt_val = (caddr_t)&on; - sopt.sopt_valsize = sizeof(on); - sopt.sopt_td = NULL; - sosetopt(so, &sopt); -} - -static int -iw_uninit_socket(struct iw_cm_id *cm_id) -{ - struct socket *so = cm_id->so; - - SOCK_LOCK(so); - soupcall_clear(so, SO_RCV); - SOCK_UNLOCK(so); - - return (0); -} - -static int -iw_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - int rc; - - iw_init_sock(cm_id); - rc = -solisten(cm_id->so, backlog, curthread); - if (rc != 0) - iw_uninit_socket(cm_id); - return (rc); -} - -static int -iw_destroy_listen(struct iw_cm_id *cm_id) -{ - - return (iw_uninit_socket(cm_id)); -} - - /* * CM_ID <-- DESTROYING * @@ -569,7 +327,6 @@ { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret = 0, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* @@ -579,23 +336,19 @@ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + /* + * Since we're deleting the cm_id, drop any events that + * might arrive before the last dereference. + */ + set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { - refcnt = - rem_port_from_listenlist(cm_id->local_addr.sin_port); - - if (refcnt == 0) - ret = iw_destroy_listen(cm_id); - - cm_id->device->iwcm->destroy_listen_ep(cm_id); - } else { - ret = iw_destroy_listen(cm_id); - cm_id->device->iwcm->destroy_listen_ep(cm_id); - } + /* destroy the listening endpoint */ + cm_id->device->iwcm->destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: @@ -647,18 +400,28 @@ struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)); - destroy_cm_id(cm_id); +} +EXPORT_SYMBOL(iw_destroy_cm_id); - wait_for_completion(&cm_id_priv->destroy_comp); - - if (cm_id->so) - sock_release(cm_id->so); +/** + * iw_cm_map - Use portmapper to map the ports + * @cm_id: connection manager pointer + * @active: Indicates the active side when true + * returns nonzero for error only if iwpm_create_mapinfo() fails + * + * Tries to add a mapping for a port using the Portmapper. If + * successful in mapping the IP/Port it will check the remote + * mapped IP address for a wildcard IP address and replace the + * zero IP address with the remote_addr. 
+ */ +static int iw_cm_map(struct iw_cm_id *cm_id, bool active) +{ + cm_id->m_local_addr = cm_id->local_addr; + cm_id->m_remote_addr = cm_id->remote_addr; - free_cm_id(cm_id_priv); + return 0; } -EXPORT_SYMBOL(iw_destroy_cm_id); /* * CM_ID <-- LISTEN @@ -670,10 +433,13 @@ { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret, refcnt; + int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + if (!backlog) + backlog = default_backlog; + ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; @@ -683,33 +449,11 @@ case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - - if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { - refcnt = - add_port_to_listenlist(cm_id->local_addr.sin_port); - - if (refcnt == 1) { - ret = iw_create_listen(cm_id, backlog); - } else if (refcnt <= 0) { - ret = -EINVAL; - } else { - /* if refcnt > 1, a socket listener created - * already. And we need not create socket - * listener on other rdma devices/listen cm_id's - * due to TOE. That is when a socket listener is - * created with INADDR_ANY all registered TOE - * devices will get a call to start - * hardware listeners. - */ - } - } else { - ret = iw_create_listen(cm_id, backlog); - } + ret = iw_cm_map(cm_id, false); if (!ret) - cm_id->device->iwcm->create_listen_ep(cm_id, backlog); - else + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; - spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: @@ -837,39 +581,37 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->connect(cm_id, iw_param); - if (ret) { - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); - cm_id_priv->qp = NULL; - } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - cm_id_priv->state = IW_CM_STATE_IDLE; - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - } + ret = iw_cm_map(cm_id, true); + if (!ret) + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (!ret) + return 0; /* success */ + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; +err: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_connect); @@ -904,7 +646,6 @@ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, - iw_event->so, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ @@ -912,9 +653,10 @@ goto 
out; cm_id->provider_data = iw_event->provider_data; - cm_id->local_addr = iw_event->local_addr; + cm_id->m_local_addr = iw_event->local_addr; + cm_id->m_remote_addr = iw_event->remote_addr; + cm_id->local_addr = listen_id_priv->id.local_addr; cm_id->remote_addr = iw_event->remote_addr; - cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; @@ -942,10 +684,7 @@ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(cm_id); - if (atomic_read(&cm_id_priv->refcount)==0) - free_cm_id(cm_id_priv); + iw_destroy_cm_id(cm_id); } out: @@ -1009,8 +748,10 @@ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { - cm_id_priv->id.local_addr = iw_event->local_addr; - cm_id_priv->id.remote_addr = iw_event->remote_addr; + cm_id_priv->id.m_local_addr = iw_event->local_addr; + cm_id_priv->id.m_remote_addr = iw_event->remote_addr; + iw_event->local_addr = cm_id_priv->id.local_addr; + iw_event->remote_addr = cm_id_priv->id.remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ @@ -1131,7 +872,6 @@ unsigned long flags; int empty; int ret = 0; - int destroy_id; spin_lock_irqsave(&cm_id_priv->lock, flags); empty = list_empty(&cm_id_priv->work_list); @@ -1144,20 +884,14 @@ put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = process_event(cm_id_priv, &levent); - if (ret) { - set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - destroy_cm_id(&cm_id_priv->id); - } - BUG_ON(atomic_read(&cm_id_priv->refcount)==0); - destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); - if (iwcm_deref_id(cm_id_priv)) { - if (destroy_id) { - BUG_ON(!list_empty(&cm_id_priv->work_list)); - free_cm_id(cm_id_priv); - } + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) + destroy_cm_id(&cm_id_priv->id); + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) return; - } if (empty) return; spin_lock_irqsave(&cm_id_priv->lock, flags); @@ -1300,7 +1034,7 @@ static int __init iw_cm_init(void) { - iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) return -ENOMEM; Index: sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static int iwpm_user_pid = IWPM_PID_UNDEFINED; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} +EXPORT_SYMBOL(iwpm_valid_pid); + Index: sys/ofed/drivers/infiniband/core/ib_iwpm_util.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_iwpm_util.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "iwpm_util.h" + +#define IWPM_MAPINFO_HASH_SIZE 512 +#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) +#define IWPM_REMINFO_HASH_SIZE 64 +#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) +#define IWPM_MSG_SIZE 512 + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + Index: sys/ofed/drivers/infiniband/core/ib_mad.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_mad.c +++ sys/ofed/drivers/infiniband/core/ib_mad.c @@ -3,6 +3,7 @@ * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,22 +36,21 @@ */ #define LINUXKPI_PARAM_PREFIX ibcore_ +#define KBUILD_MODNAME "ibcore" + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include #include -#include #include #include "mad_priv.h" #include "mad_rmpp.h" #include "smi.h" +#include "opa_smi.h" #include "agent.h" - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); +#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -60,29 +60,9 @@ module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -static struct kmem_cache *ib_mad_cache; - static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; - -/* - * Timeout FIFO (tf) param - */ -enum { - /* min time between 2 consecutive activations of tf workqueue */ - MIN_BETWEEN_ACTIVATIONS_MS = 5 -}; - -/* - * SA congestion control params - */ -enum { - MAX_OUTSTANDING_SA_MADS = 10, - MIN_TIME_FOR_SA_MAD_SEND_MS = 20, - MAX_SA_MADS = 10000 -}; - /* Port list lock */ static DEFINE_SPINLOCK(ib_mad_port_list_lock); @@ -92,7 +72,7 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *priv); static struct ib_mad_agent_private *find_mad_agent( struct ib_mad_port_private *port_priv, - struct ib_mad *mad); + const struct ib_mad_hdr *mad); static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad); static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); @@ -103,512 +83,9 @@ u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); -static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, - u32 timeout_ms, u32 retries_left); - - -/* - * Timeout FIFO functions - implements FIFO with timeout mechanism - */ - -static void activate_timeout_handler_task(unsigned long data) -{ - struct to_fifo *tf; - - tf = (struct to_fifo *)data; - del_timer(&tf->timer); - queue_work(tf->workq, &tf->work); -} - -static unsigned long adjusted_time(unsigned long last, unsigned long next) -{ - unsigned long min_next; - - min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS); - if (time_after(min_next, next)) - return min_next; - - return next; -} - -static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr, - enum ib_wc_status status) -{ - struct ib_mad_send_wc mad_send_wc; - struct ib_mad_agent_private *mad_agent_priv; - - mad_send_wc.status = status; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); -} - -static inline struct sa_cc_data * -get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr) -{ - return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc; -} - -static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe) -{ - return container_of(tfe, struct ib_mad_send_wr_private, tf_list); -} - -static void timeout_handler_task(struct work_struct *work) -{ - struct tf_entry *tmp1, *tmp2; - struct list_head *list_item, exp_lst; - unsigned long flags, curr_time; - int lst_empty; - struct to_fifo *tf; - - tf = container_of(work, struct to_fifo, work); - do { - INIT_LIST_HEAD(&exp_lst); - - spin_lock_irqsave(&tf->lists_lock, flags); - curr_time = jiffies; - 
list_for_each(list_item, &tf->to_head) { - tmp1 = list_entry(list_item, struct tf_entry, to_list); - if (time_before(curr_time, tmp1->exp_time)) - break; - list_del(&tmp1->fifo_list); - tf->num_items--; - } - - /* cut list up to and including list_item->prev */ - list_cut_position(&exp_lst, &tf->to_head, list_item->prev); - spin_unlock_irqrestore(&tf->lists_lock, flags); - - lst_empty = list_empty(&exp_lst); - list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) { - list_del(&tmp1->to_list); - if (tmp1->canceled) { - tmp1->canceled = 0; - notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR); - } else { - notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR); - } - } - } while (!lst_empty); - - spin_lock_irqsave(&tf->lists_lock, flags); - if (!list_empty(&tf->to_head)) { - tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list); - mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time)); - } - spin_unlock_irqrestore(&tf->lists_lock, flags); -} - -/** - * tf_create - creates new timeout-fifo object - * @fifo_size: Maximum fifo size - * - * Allocate and initialize new timeout-fifo object - */ -static struct to_fifo *tf_create(u32 fifo_size) -{ - struct to_fifo *tf; - - tf = kzalloc(sizeof(*tf), GFP_KERNEL); - if (tf) { - tf->workq = create_singlethread_workqueue("to_fifo"); - if (!tf->workq) { - kfree(tf); - return NULL; - } - spin_lock_init(&tf->lists_lock); - INIT_LIST_HEAD(&tf->to_head); - INIT_LIST_HEAD(&tf->fifo_head); - init_timer(&tf->timer); - INIT_WORK(&tf->work, timeout_handler_task); - tf->timer.data = (unsigned long) tf; - tf->timer.function = activate_timeout_handler_task; - tf->timer.expires = jiffies; - tf->fifo_size = fifo_size; - tf->stop_enqueue = 0; - tf->num_items = 0; - } - - return tf; -} - -/** - * tf_enqueue - enqueue item to timeout-fifo object - * @tf:timeout-fifo object - * @item: item to enqueue. - * @timeout_ms: item expiration time in ms. - * - * Enqueue item to fifo and modify expiration timer when required. - * - * Returns 0 on success and negative on failure. - */ -static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms) -{ - struct tf_entry *tmp; - struct list_head *list_item; - unsigned long flags; - - item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); - - spin_lock_irqsave(&tf->lists_lock, flags); - if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return -EBUSY; - } - - /* Insert item to timeout list */ - list_for_each_prev(list_item, &tf->to_head) { - tmp = list_entry(list_item, struct tf_entry, to_list); - if (time_after(item->exp_time, tmp->exp_time)) - break; - } - - list_add(&item->to_list, list_item); - - /* Insert item to fifo list */ - list_add_tail(&item->fifo_list, &tf->fifo_head); - - tf->num_items++; - - /* modify expiration timer if required */ - if (list_item == &tf->to_head) - mod_timer(&tf->timer, item->exp_time); - - spin_unlock_irqrestore(&tf->lists_lock, flags); - - return 0; -} - -/** - * tf_dequeue - dequeue item from timeout-fifo object - * @tf:timeout-fifo object - * @time_left_ms: returns the time left for expiration in ms. - * - * Dequeue item from fifo and modify expiration timer when required. - * - * Returns pointer to tf_entry on success and NULL on failure. 
- */ -static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms) -{ - unsigned long flags; - unsigned long time_left; - struct tf_entry *tmp, *tmp1; - bool found = false; - - spin_lock_irqsave(&tf->lists_lock, flags); - if (list_empty(&tf->fifo_head)) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return NULL; - } - - list_for_each_entry(tmp, &tf->fifo_head, fifo_list) { - if (!tmp->canceled) { - found = true; - break; - } - } - - if (!found) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return NULL; - } - - /* modify timer in case enqueued item is the next to expire */ - if (tf->to_head.next == &tmp->to_list) { - if (list_is_last(&tmp->to_list, &tf->to_head)) { - del_timer(&tf->timer); - } else { - tmp1 = list_entry(tmp->to_list.next, struct tf_entry, to_list); - mod_timer(&tf->timer, tmp1->exp_time); - } - } - list_del(&tmp->fifo_list); - list_del(&tmp->to_list); - tf->num_items--; - spin_unlock_irqrestore(&tf->lists_lock, flags); - - time_left = tmp->exp_time - jiffies; - if ((long) time_left <= 0) - time_left = 0; - *time_left_ms = jiffies_to_msecs(time_left); - - return tmp; -} - -static void tf_stop_enqueue(struct to_fifo *tf) -{ - unsigned long flags; - - spin_lock_irqsave(&tf->lists_lock, flags); - tf->stop_enqueue = 1; - spin_unlock_irqrestore(&tf->lists_lock, flags); -} - -/** - * tf_free - free empty timeout-fifo object - * @tf:timeout-fifo object - * - */ -static void tf_free(struct to_fifo *tf) -{ - del_timer_sync(&tf->timer); - flush_workqueue(tf->workq); - destroy_workqueue(tf->workq); - - kfree(tf); -} - -/** - * tf_free_agent - free MADs related to specific MAD agent from timeout-fifo - * @tf:timeout-fifo object - * @mad_agent_priv: MAD agent. - * - */ -static void tf_free_agent(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv) -{ - unsigned long flags; - struct tf_entry *tmp, *tmp1; - struct list_head tmp_head; - - INIT_LIST_HEAD(&tmp_head); - spin_lock_irqsave(&tf->lists_lock, flags); - list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) { - if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) { - list_del(&tmp->to_list); - list_move(&tmp->fifo_list, &tmp_head); - tf->num_items--; - } - } - spin_unlock_irqrestore(&tf->lists_lock, flags); - - list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) { - list_del(&tmp->fifo_list); - notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR); - } -} - -/** - * tf_modify_item - to modify expiration time for specific item - * @tf:timeout-fifo object - * @mad_agent_priv: MAD agent. - * @send_buf: the MAD to modify in queue - * @timeout_ms: new timeout to set. - * - * Returns 0 if item found on list and -ENXIO if not. - * - * Note: The send_buf may point on MAD that is already released. 
- * Therefore we can't use this struct before finding it in the list - */ -static int tf_modify_item(struct to_fifo *tf, - struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) -{ - struct tf_entry *tmp, *item; - struct list_head *list_item; - unsigned long flags; - int found = 0; - - spin_lock_irqsave(&tf->lists_lock, flags); - list_for_each_entry(item, &tf->fifo_head, fifo_list) { - if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv && - &tfe_to_mad(item)->send_buf == send_buf) { - found = 1; - break; - } - } - - if (!found) { - spin_unlock_irqrestore(&tf->lists_lock, flags); - return -ENXIO; - } - - item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); - - if (timeout_ms) { - list_del(&item->to_list); - list_for_each_prev(list_item, &tf->to_head) { - tmp = list_entry(list_item, struct tf_entry, to_list); - if (time_after(item->exp_time, tmp->exp_time)) - break; - } - list_add(&item->to_list, list_item); - - /* modify expiration timer if required */ - if (list_item == &tf->to_head) - mod_timer(&tf->timer, item->exp_time); - } else { - /* - * when item canceled (timeout_ms == 0) move item to - * head of timeout list and to the tail of fifo list - */ - item->canceled = 1; - list_move(&item->to_list, &tf->to_head); - list_move_tail(&item->fifo_list, &tf->fifo_head); - mod_timer(&tf->timer, item->exp_time); - } - spin_unlock_irqrestore(&tf->lists_lock, flags); - - return 0; -} - -/* - * SA congestion control functions - */ - -/* - * Defines which MAD is under congestion control. - */ -static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr) -{ - struct ib_mad_hdr *mad; - - mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; - - return ((mad_send_wr->send_buf.timeout_ms) && - (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) && - ((mad->method == IB_MGMT_METHOD_GET) || - (mad->method == IB_MGMT_METHOD_SET))); -} - -/* - * Notify that SA congestion controlled MAD is done. - * to allow dequeuing SA MAD from congestion control queue. - */ -static void sa_cc_mad_done(struct sa_cc_data *cc_obj) -{ - unsigned long flags; - struct tf_entry *tfe; - struct ib_mad_send_wr_private *mad_send_wr; - u32 time_left_ms, timeout_ms, retries; - int ret; - - do { - spin_lock_irqsave(&cc_obj->lock, flags); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - if (!tfe) { - if (cc_obj->outstanding > 0) - cc_obj->outstanding--; - spin_unlock_irqrestore(&cc_obj->lock, flags); - break; - } - spin_unlock_irqrestore(&cc_obj->lock, flags); - mad_send_wr = tfe_to_mad(tfe); - time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS; - if (time_left_ms > mad_send_wr->send_buf.timeout_ms) { - retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1; - timeout_ms = mad_send_wr->send_buf.timeout_ms; - } else { - retries = 0; - timeout_ms = time_left_ms; - } - ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries); - if (ret) { - if (ret == -ENOMEM) - notify_failure(mad_send_wr, IB_WC_GENERAL_ERR); - else - notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR); - } - } while (ret); -} - -/* - * Send SA MAD under congestion control. 
- */ -static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr) -{ - unsigned long flags; - int ret; - struct sa_cc_data *cc_obj; - - cc_obj = get_cc_obj(mad_send_wr); - spin_lock_irqsave(&cc_obj->lock, flags); - if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) { - cc_obj->outstanding++; - spin_unlock_irqrestore(&cc_obj->lock, flags); - ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms, - mad_send_wr->retries_left); - if (ret) - sa_cc_mad_done(cc_obj); - - } else { - int qtime = (mad_send_wr->send_buf.timeout_ms * - (mad_send_wr->retries_left + 1)) - - MIN_TIME_FOR_SA_MAD_SEND_MS; - - if (qtime < 0) - qtime = 0; - ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime); - - spin_unlock_irqrestore(&cc_obj->lock, flags); - } - - return ret; -} - -/* - * Initialize SA congestion control. - */ -static int sa_cc_init(struct sa_cc_data *cc_obj) -{ - spin_lock_init(&cc_obj->lock); - cc_obj->outstanding = 0; - cc_obj->tf = tf_create(MAX_SA_MADS); - if (!cc_obj->tf) - return -ENOMEM; - return 0; -} - -/* - * Cancel SA MADs from congestion control queue. - */ -static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv) -{ - tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf, - mad_agent_priv); -} - -/* - * Modify timeout of SA MAD on congestion control queue. - */ -static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) -{ - int ret; - int qtime = 0; - - if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS) - qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS; - - ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf, - mad_agent_priv, send_buf, (u32)qtime); - return ret; -} - -static void sa_cc_destroy(struct sa_cc_data *cc_obj) -{ - struct ib_mad_send_wr_private *mad_send_wr; - struct tf_entry *tfe; - struct ib_mad_send_wc mad_send_wc; - struct ib_mad_agent_private *mad_agent_priv; - u32 time_left_ms; - - mad_send_wc.status = IB_WC_WR_FLUSH_ERR; - mad_send_wc.vendor_err = 0; - - tf_stop_enqueue(cc_obj->tf); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - while (tfe) { - mad_send_wr = tfe_to_mad(tfe); - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - tfe = tf_dequeue(cc_obj->tf, &time_left_ms); - } - tf_free(cc_obj->tf); -} +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -704,12 +181,12 @@ return 0; } -int ib_response_mad(struct ib_mad *mad) +int ib_response_mad(const struct ib_mad_hdr *hdr) { - return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || - (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || - ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && - (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); + return ((hdr->method & IB_MGMT_METHOD_RESP) || + (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && + (hdr->attr_mod & IB_BM_ATTR_MOD_RESP))); } EXPORT_SYMBOL(ib_response_mad); @@ -723,7 +200,8 @@ u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context) + void *context, + u32 registration_flags) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret = ERR_PTR(-EINVAL); @@ -739,68 +217,109 @@ /* Validate parameters */ qpn = get_spl_qp_index(qp_type); - if (qpn == 
-1) + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); goto error1; + } - if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); goto error1; + } /* Validate MAD registration request if supplied */ if (mad_reg_req) { - if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); goto error1; - if (!recv_handler) + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); goto error1; + } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { /* * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only * one in this range currently allowed */ if (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* * If class is in "new" vendor range, * ensure supplied OUI is not zero */ - if (!is_vendor_oui(mad_reg_req->oui)) + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { - if (rmpp_version) + if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } + /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else { if ((mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } } else { /* No registration request supplied */ if (!send_handler) goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; } /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { + dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); ret = ERR_PTR(-ENODEV); goto error1; } @@ -808,6 +327,8 @@ /* Verify the QP requested is supported. 
For example, Ethernet devices * will not have QP0 */ if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } @@ -819,13 +340,6 @@ goto error1; } - mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(mad_agent_priv->agent.mr)) { - ret = ERR_PTR(-ENOMEM); - goto error2; - } - if (mad_reg_req) { reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); if (!reg_req) { @@ -844,6 +358,7 @@ mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); @@ -909,8 +424,6 @@ spin_unlock_irqrestore(&port_priv->reg_lock, flags); kfree(reg_req); error3: - ib_dereg_mr(mad_agent_priv->agent.mr); -error2: kfree(mad_agent_priv); error1: return ret; @@ -1056,7 +569,7 @@ */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; - cancel_delayed_work_sync(&mad_agent_priv->timed_work); + cancel_delayed_work(&mad_agent_priv->timed_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); @@ -1070,7 +583,6 @@ wait_for_completion(&mad_agent_priv->comp); kfree(mad_agent_priv->reg_req); - ib_dereg_mr(mad_agent_priv->agent.mr); kfree(mad_agent_priv); } @@ -1099,7 +611,6 @@ struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; - if (!IS_ERR(mad_agent)) { /* If the TID is zero, the agent can only snoop. */ if (mad_agent->hi_tid) { mad_agent_priv = container_of(mad_agent, @@ -1112,8 +623,6 @@ agent); unregister_mad_snoop(mad_snoop_priv); } - } - return 0; } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -1174,7 +683,7 @@ atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); @@ -1182,12 +691,11 @@ spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; @@ -1200,6 +708,32 @@ wc->port_num = port_num; } +static size_t mad_priv_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_mad_private) + mp->mad_size; +} + +static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags) +{ + size_t size = sizeof(struct ib_mad_private) + mad_size; + struct ib_mad_private *ret = kzalloc(size, flags); + + if (ret) + ret->mad_size = mad_size; + + return ret; +} + +static size_t port_mad_size(const struct ib_mad_port_private *port_priv) +{ + return rdma_max_mad_size(port_priv->device, port_priv->port_num); +} + +static size_t mad_priv_dma_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_grh) + mp->mad_size; +} + /* * Return 0 if SMP is to be sent * Return 1 if SMP was consumed locally (whether or not solicited) @@ -1210,6 +744,7 @@ { int ret = 0; 
struct ib_smp *smp = mad_send_wr->send_buf.mad; + struct opa_smp *opa_smp = (struct opa_smp *)smp; unsigned long flags; struct ib_mad_local_private *local; struct ib_mad_private *mad_priv; @@ -1218,11 +753,16 @@ struct ib_device *device = mad_agent_priv->agent.device; u8 port_num; struct ib_wc mad_wc; - struct ib_send_wr *send_wr = &mad_send_wr->send_wr; - - if (device->node_type == RDMA_NODE_IB_SWITCH && + struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; + size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); + u16 out_mad_pkey_index = 0; + u16 drslid; + bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + if (rdma_cap_ib_switch(device) && smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - port_num = send_wr->wr.ud.port_num; + port_num = send_wr->port_num; else port_num = mad_agent_priv->agent.port_num; @@ -1232,50 +772,86 @@ * If we are at the start of the LID routed part, don't update the * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. */ - if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) != - IB_LID_PERMISSIVE) - goto out; - if (smi_handle_dr_smp_send(smp, device->node_type, port_num) == - IB_SMI_DISCARD) { - ret = -EINVAL; - printk(KERN_ERR PFX "Invalid directed route\n"); - goto out; - } + if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) { + u32 opa_drslid; + + if ((opa_get_smp_direction(opa_smp) + ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), + port_num) == IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid directed route\n"); + goto out; + } + opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && + opa_drslid & 0xffff0000) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", + opa_drslid); + goto out; + } + drslid = (u16)(opa_drslid & 0x0000ffff); - /* Check to post send on QP or process locally */ - if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && - smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) - goto out; + /* Check to post send on QP or process locally */ + if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD && + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { + if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + drslid = be16_to_cpu(smp->dr_slid); + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + } local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; local->recv_mad_agent = NULL; - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); + mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for local response MAD\n"); + dev_err(&device->dev, "No memory for local response MAD\n"); kfree(local); goto out; } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr_id, be16_to_cpu(smp->dr_slid), - send_wr->wr.ud.pkey_index, - send_wr->wr.ud.port_num, &mad_wc); + send_wr->wr.wr_cqe, drslid, + send_wr->pkey_index, + send_wr->port_num, &mad_wc); + + if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) { + mad_wc.byte_len = mad_send_wr->send_buf.hdr_len + + mad_send_wr->send_buf.data_len + + sizeof(struct ib_grh); + } /* No GRH for DR SMP */ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, - (struct ib_mad *)smp, - (struct ib_mad *)&mad_priv->mad); + (const struct ib_mad_hdr *)smp, mad_size, + (struct ib_mad_hdr *)mad_priv->mad, + &mad_size, &out_mad_pkey_index); switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: - if (ib_response_mad(&mad_priv->mad.mad) && + if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; @@ -1285,39 +861,43 @@ */ atomic_inc(&mad_agent_priv->refcount); } else - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; case IB_MAD_RESULT_SUCCESS: /* Treat like an incoming receive MAD */ port_priv = ib_get_mad_port(mad_agent_priv->agent.device, mad_agent_priv->agent.port_num); if (port_priv) { - memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad)); + memcpy(mad_priv->mad, smp, mad_priv->mad_size); recv_mad_agent = find_mad_agent(port_priv, - &mad_priv->mad.mad); + (const struct ib_mad_hdr *)mad_priv->mad); } if (!port_priv || !recv_mad_agent) { /* * No receiving agent so drop packet and * generate send completion. 
*/ - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); break; } local->mad_priv = mad_priv; local->recv_mad_agent = recv_mad_agent; break; default: - kmem_cache_free(ib_mad_cache, mad_priv); + kfree(mad_priv); kfree(local); ret = -EINVAL; goto out; } local->mad_send_wr = mad_send_wr; + if (opa) { + local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index; + local->return_wc_byte_len = mad_size; + } /* Reference MAD agent until send side of local completion handled */ atomic_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ @@ -1331,11 +911,11 @@ return ret; } -static int get_pad_size(int hdr_len, int data_len) +static int get_pad_size(int hdr_len, int data_len, size_t mad_size) { int seg_size, pad; - seg_size = sizeof(struct ib_mad) - hdr_len; + seg_size = mad_size - hdr_len; if (data_len && seg_size) { pad = seg_size - data_len % seg_size; return pad == seg_size ? 0 : pad; @@ -1354,14 +934,15 @@ } static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, - gfp_t gfp_mask) + size_t mad_size, gfp_t gfp_mask) { struct ib_mad_send_buf *send_buf = &send_wr->send_buf; struct ib_rmpp_mad *rmpp_mad = send_buf->mad; struct ib_rmpp_segment *seg = NULL; int left, seg_size, pad; - send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; + send_buf->seg_size = mad_size - send_buf->hdr_len; + send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR; seg_size = send_buf->seg_size; pad = send_wr->pad; @@ -1369,9 +950,9 @@ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { - printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " - "alloc failed for len %zd, gfp %#x\n", - sizeof (*seg) + seg_size, gfp_mask); + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } @@ -1394,33 +975,52 @@ return 0; } +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, - gfp_t gfp_mask) + gfp_t gfp_mask, + u8 base_version) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; int pad, message_size, ret, size; void *buf; + size_t mad_size; + bool opa; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); - pad = get_pad_size(hdr_len, data_len); + + opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num); + + if (opa && base_version == OPA_MGMT_BASE_VERSION) + mad_size = sizeof(struct opa_mad); + else + mad_size = sizeof(struct ib_mad); + + pad = get_pad_size(hdr_len, data_len, mad_size); message_size = hdr_len + data_len + pad; - if ((!mad_agent->rmpp_version && - (rmpp_active || message_size > sizeof(struct ib_mad))) || - (!rmpp_active && message_size > sizeof(struct ib_mad))) - return ERR_PTR(-EINVAL); + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > mad_size) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > mad_size) + return ERR_PTR(-EINVAL); - size = rmpp_active ? hdr_len : sizeof(struct ib_mad); + size = rmpp_active ? 
hdr_len : mad_size; buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); - mad_send_wr = buf + size; + mad_send_wr = (struct ib_mad_send_wr_private *)((char *)buf + size); INIT_LIST_HEAD(&mad_send_wr->rmpp_list); mad_send_wr->send_buf.mad = buf; mad_send_wr->send_buf.hdr_len = hdr_len; @@ -1429,21 +1029,30 @@ mad_send_wr->mad_agent_priv = mad_agent_priv; mad_send_wr->sg_list[0].length = hdr_len; - mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; - mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; - mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; - - mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; - mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; - mad_send_wr->send_wr.num_sge = 2; - mad_send_wr->send_wr.opcode = IB_WR_SEND; - mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; - mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; - mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; - mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; + mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; + + /* OPA MADs don't have to be the full 2048 bytes */ + if (opa && base_version == OPA_MGMT_BASE_VERSION && + data_len < mad_size - hdr_len) + mad_send_wr->sg_list[1].length = data_len; + else + mad_send_wr->sg_list[1].length = mad_size - hdr_len; + + mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; + + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.wr.num_sge = 2; + mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.remote_qpn = remote_qpn; + mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.pkey_index = pkey_index; if (rmpp_active) { - ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); + ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); if (ret) { kfree(buf); return ERR_PTR(ret); @@ -1513,7 +1122,7 @@ return ib_get_rmpp_segment(&mad_send_wr->send_buf, mad_send_wr->seg_num); else - return mad_send_wr->send_buf.mad + + return (char *)mad_send_wr->send_buf.mad + mad_send_wr->send_buf.hdr_len; } @@ -1545,8 +1154,9 @@ /* Set WR ID to find mad_send_wr upon completion */ qp_info = mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; @@ -1557,22 +1167,23 @@ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) return -ENOMEM; + mad_send_wr->header_mapping = sge[0].addr; + sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { - ret = -ENOMEM; - goto dma1_err; + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; } - - mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { - ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr, + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, 
&bad_send_wr); list = &qp_info->send_queue.list; } else { @@ -1585,51 +1196,14 @@ list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - - if (!ret) - return 0; - + if (ret) { ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, - sge[1].length, DMA_TO_DEVICE); -dma1_err: + sge[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, - sge[0].length, DMA_TO_DEVICE); - return ret; -} - -/* - * Send SA MAD that passed congestion control - */ -static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, - u32 timeout_ms, u32 retries_left) -{ - int ret; - unsigned long flags; - struct ib_mad_agent_private *mad_agent_priv; - - mad_agent_priv = mad_send_wr->mad_agent_priv; - mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); - mad_send_wr->retries_left = retries_left; - mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); - - /* Reference MAD agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - ret = ib_send_mad(mad_send_wr); - if (ret < 0) { - /* Fail send request */ - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + sge[1].length, DMA_TO_DEVICE); } - return ret; } @@ -1674,7 +1248,7 @@ * request associated with the completion */ next_send_buf = send_buf->next; - mad_send_wr->send_wr.wr.ud.ah = send_buf->ah; + mad_send_wr->send_wr.ah = send_buf->ah; if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { @@ -1696,12 +1270,6 @@ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); mad_send_wr->status = IB_WC_SUCCESS; - if (is_sa_cc_mad(mad_send_wr)) { - mad_send_wr->is_sa_cc_mad = 1; - ret = sa_cc_mad_send(mad_send_wr); - if (ret < 0) - goto error; - } else { /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); @@ -1709,7 +1277,7 @@ &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_send_rmpp_mad(mad_send_wr); if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); @@ -1724,7 +1292,6 @@ goto error; } } - } return 0; error: if (bad_send_buf) @@ -1756,7 +1323,7 @@ recv_wc); priv = container_of(mad_priv_hdr, struct ib_mad_private, header); - kmem_cache_free(ib_mad_cache, priv); + kfree(priv); } } EXPORT_SYMBOL(ib_free_recv_mad); @@ -1774,7 +1341,8 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { - printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); @@ -1786,7 +1354,7 @@ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { - printk(KERN_ERR PFX "Method %d already in use\n", i); + pr_err("Method %d already in use\n", i); return -EINVAL; } } @@ -1798,8 +1366,7 @@ /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_method_table\n"); + pr_err("No 
memory for ib_mad_mgmt_method_table\n"); return -ENOMEM; } @@ -1843,7 +1410,7 @@ } static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, - char *oui) + const char *oui) { int i; @@ -1894,8 +1461,8 @@ /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } @@ -1961,8 +1528,8 @@ /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); goto error1; } @@ -1972,8 +1539,8 @@ /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class\n"); goto error2; } @@ -2004,7 +1571,7 @@ goto check_in_use; } } - printk(KERN_ERR PFX "All OUI slots in use\n"); + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: @@ -2074,9 +1641,9 @@ /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ - kfree(method); - class->method_table[mgmt_class] = NULL; - /* Any management classes left ? */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? */ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); @@ -2141,13 +1708,13 @@ static struct ib_mad_agent_private * find_mad_agent(struct ib_mad_port_private *port_priv, - struct ib_mad *mad) + const struct ib_mad_hdr *mad_hdr) { struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); - if (ib_response_mad(mad)) { + if (ib_response_mad(mad_hdr)) { u32 hi_tid; struct ib_mad_agent_private *entry; @@ -2155,7 +1722,7 @@ * Routing is based on high 32 bits of transaction ID * of MAD. 
*/ - hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32; + hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; list_for_each_entry(entry, &port_priv->agent_list, agent_list) { if (entry->agent.hi_tid == hi_tid) { mad_agent = entry; @@ -2167,45 +1734,45 @@ struct ib_mad_mgmt_method_table *method; struct ib_mad_mgmt_vendor_class_table *vendor; struct ib_mad_mgmt_vendor_class *vendor_class; - struct ib_vendor_mad *vendor_mad; + const struct ib_vendor_mad *vendor_mad; int index; /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI */ - if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION) + if (mad_hdr->class_version >= MAX_MGMT_VERSION) goto out; - if (!is_vendor_class(mad->mad_hdr.mgmt_class)) { + if (!is_vendor_class(mad_hdr->mgmt_class)) { class = port_priv->version[ - mad->mad_hdr.class_version].class; + mad_hdr->class_version].class; if (!class) goto out; - if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >= + if (convert_mgmt_class(mad_hdr->mgmt_class) >= IB_MGMT_MAX_METHODS) goto out; method = class->method_table[convert_mgmt_class( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (method) - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } else { vendor = port_priv->version[ - mad->mad_hdr.class_version].vendor; + mad_hdr->class_version].vendor; if (!vendor) goto out; vendor_class = vendor->vendor_class[vendor_class_index( - mad->mad_hdr.mgmt_class)]; + mad_hdr->mgmt_class)]; if (!vendor_class) goto out; /* Find matching OUI */ - vendor_mad = (struct ib_vendor_mad *)mad; + vendor_mad = (const struct ib_vendor_mad *)mad_hdr; index = find_vendor_oui(vendor_class, vendor_mad->oui); if (index == -1) goto out; method = vendor_class->method_table[index]; if (method) { - mad_agent = method->agent[mad->mad_hdr.method & + mad_agent = method->agent[mad_hdr->method & ~IB_MGMT_METHOD_RESP]; } } @@ -2215,9 +1782,9 @@ if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { - printk(KERN_NOTICE PFX "No receive handler for client " - "%p on port %d\n", - &mad_agent->agent, port_priv->port_num); + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } @@ -2227,23 +1794,32 @@ return mad_agent; } -static int validate_mad(struct ib_mad *mad, u32 qp_num) +static int validate_mad(const struct ib_mad_hdr *mad_hdr, + const struct ib_mad_qp_info *qp_info, + bool opa) { int valid = 0; + u32 qp_num = qp_info->qp->qp_num; /* Make sure MAD base version is understood */ - if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - printk(KERN_ERR PFX "MAD received with unsupported base " - "version %d\n", mad->mad_hdr.base_version); + if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && + (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { + pr_err("MAD received with unsupported base version %d %s\n", + mad_hdr->base_version, opa ? 
"(opa)" : ""); goto out; } /* Filter SMI packets sent to other than QP0 */ - if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || - (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; @@ -2253,28 +1829,29 @@ return valid; } -static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_hdr *mad_hdr) +static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_hdr *mad_hdr) { - struct ib_rmpp_mad *rmpp_mad; + const struct ib_rmpp_mad *rmpp_mad; - rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; + rmpp_mad = (const struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); } -static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc) +static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) { - return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class == + return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_send_wr_private *wr, - struct ib_mad_recv_wc *rwc ) +static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; @@ -2283,8 +1860,8 @@ u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; - send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad); - rcv_resp = ib_response_mad(rwc->recv_buf.mad); + send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); + rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ @@ -2309,7 +1886,7 @@ ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, - attr.grh.sgid_index, &sgid)) + attr.grh.sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); @@ -2329,22 +1906,22 @@ } struct ib_mad_send_wr_private* -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *wc) +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *wc) { struct ib_mad_send_wr_private *wr; - struct ib_mad *mad; + const struct ib_mad_hdr *mad_hdr; - mad = (struct ib_mad *)wc->recv_buf.mad; + mad_hdr = &wc->recv_buf.mad->mad_hdr; list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { - if ((wr->tid == mad->mad_hdr.tid) && + if ((wr->tid == mad_hdr->tid) && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. 
*/ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } @@ -2354,15 +1931,15 @@ * been notified that the send has completed */ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && - wr->tid == mad->mad_hdr.tid && + if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad_hdr->tid && wr->timeout && rcv_has_same_class(wr, wc) && /* * Don't check GID for direct routed MADs. * These might have permissive LIDs. */ - (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ return (wr->status == IB_WC_SUCCESS) ? wr : NULL; @@ -2387,7 +1964,7 @@ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { @@ -2397,168 +1974,321 @@ } /* Complete corresponding request */ - if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { + if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - ib_free_recv_mad(mad_recv_wc); - deref_mad_agent(mad_agent_priv); - return; - } - ib_mark_mad_done(mad_send_wr); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + /* Defined behavior is to complete response before request */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } } -static bool generate_unmatched_resp(struct ib_mad_private *recv, - struct ib_mad_private *response) +static enum smi_action handle_ib_smi(const 
struct ib_mad_port_private *port_priv, + const struct ib_mad_qp_info *qp_info, + const struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) { - if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET || - recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) { - memcpy(response, recv, sizeof *response); + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; + + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; response->header.recv_wc.recv_buf.grh = &response->grh; - response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; - response->mad.mad.mad_hdr.status = - cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); - if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) - response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(smp), + qp_info->qp->qp_num, + response->mad_size, + false); + + return IB_SMI_DISCARD; + } + return IB_SMI_HANDLE; +} + +static bool generate_unmatched_resp(const struct ib_mad_private *recv, + struct ib_mad_private *response, + size_t *resp_len, bool opa) +{ + const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; + struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; + + if (recv_hdr->method == IB_MGMT_METHOD_GET || + recv_hdr->method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + resp_hdr->method = IB_MGMT_METHOD_GET_RESP; + resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + resp_hdr->status |= IB_SMP_DIRECTION; + + if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { + if (recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED || + recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + *resp_len = opa_get_smp_header_size( + (const struct opa_smp *)recv->mad); + else + *resp_len = sizeof(struct ib_mad_hdr); + } return true; } else { return false; } } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) + +static enum smi_action +handle_opa_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; + + if 
(opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = opa_smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (opa_smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (opa_smi_check_local_smp(smp, port_priv->device) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.opa_mad = + (struct opa_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + opa_smi_get_fwd_port(smp), + qp_info->qp->qp_num, + recv->header.wc.byte_len, + true); + + return IB_SMI_DISCARD; + } + + return IB_SMI_HANDLE; +} + +static enum smi_action +handle_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response, + bool opa) +{ + struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; + + if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && + mad_hdr->class_version == OPA_SMI_CLASS_VERSION) + return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, + response); + + return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); +} + +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; + size_t mad_size; + u16 resp_mad_pkey_index = 0; + bool opa; + + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); + opa = rdma_cap_opa_mad(qp_info->port_priv->device, + qp_info->port_priv->port_num); + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, mad_list); recv = container_of(mad_priv_hdr, struct ib_mad_private, header); ib_dma_unmap_single(port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); /* Setup MAD receive work completion from "normal" work completion */ recv->header.wc = *wc; recv->header.recv_wc.wc = &recv->header.wc; - recv->header.recv_wc.mad_len = sizeof(struct ib_mad); - recv->header.recv_wc.recv_buf.mad = &recv->mad.mad; + + if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { + recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); + recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + recv->header.recv_wc.mad_len = 
sizeof(struct ib_mad); + recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; if (atomic_read(&qp_info->snoop_count)) snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); /* Validate MAD */ - if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num)) + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; - response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { - printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " - "for response buffer\n"); + dev_err(&port_priv->device->dev, + "%s: no memory for response buffer\n", __func__); goto out; } - if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) + if (rdma_cap_ib_switch(port_priv->device)) port_num = wc->port_num; else port_num = port_priv->port_num; - if (recv->mad.mad.mad_hdr.mgmt_class == + if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - enum smi_forward_action retsmi; - - if (smi_handle_dr_smp_recv(&recv->mad.smp, - port_priv->device->node_type, - port_num, - port_priv->device->phys_port_cnt) == - IB_SMI_DISCARD) + if (handle_smi(port_priv, qp_info, wc, port_num, recv, + response, opa) + == IB_SMI_DISCARD) goto out; - - retsmi = smi_check_forward_dr_smp(&recv->mad.smp); - if (retsmi == IB_SMI_LOCAL) - goto local; - - if (retsmi == IB_SMI_SEND) { /* don't forward */ - if (smi_handle_dr_smp_send(&recv->mad.smp, - port_priv->device->node_type, - port_num) == IB_SMI_DISCARD) - goto out; - - if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD) - goto out; - } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { - /* forward case for switches */ - memcpy(response, recv, sizeof(*response)); - response->header.recv_wc.wc = &response->header.wc; - response->header.recv_wc.recv_buf.mad = &response->mad.mad; - response->header.recv_wc.recv_buf.grh = &response->grh; - - agent_send_response(&response->mad.mad, - &response->grh, wc, - port_priv->device, - smi_get_fwd_port(&recv->mad.smp), - qp_info->qp->qp_num); - - goto out; - } } -local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, wc, &recv->grh, - &recv->mad.mad, - &response->mad.mad); + (const struct ib_mad_hdr *)recv->mad, + recv->mad_size, + (struct ib_mad_hdr *)response->mad, + &mad_size, &resp_mad_pkey_index); + + if (opa) + wc->pkey_index = resp_mad_pkey_index; + if (ret & IB_MAD_RESULT_SUCCESS) { if (ret & IB_MAD_RESULT_CONSUMED) goto out; if (ret & IB_MAD_RESULT_REPLY) { - agent_send_response(&response->mad.mad, + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, port_priv->device, port_num, - qp_info->qp->qp_num); + qp_info->qp->qp_num, + mad_size, opa); goto out; } } } - mad_agent = find_mad_agent(port_priv, &recv->mad.mad); + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* @@ -2567,17 +2297,17 @@ */ recv = NULL; } else if ((ret & IB_MAD_RESULT_SUCCESS) && - generate_unmatched_resp(recv, response)) { - agent_send_response(&response->mad.mad, &recv->grh, wc, - port_priv->device, port_num, qp_info->qp->qp_num); + generate_unmatched_resp(recv, response, &mad_size, opa)) { + 
agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, + port_priv->device, port_num, + qp_info->qp->qp_num, mad_size, opa); } out: /* Post another receive request for this QP */ if (response) { ib_mad_post_receive_mads(qp_info, response); - if (recv) - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } else ib_mad_post_receive_mads(qp_info, recv); } @@ -2658,7 +2388,7 @@ mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; @@ -2688,12 +2418,9 @@ mad_send_wc->status = mad_send_wr->status; if (ret == IB_RMPP_RESULT_INTERNAL) ib_rmpp_send_handler(mad_send_wc); - else { - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); + else mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); - } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); @@ -2702,11 +2429,12 @@ spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; @@ -2714,7 +2442,14 @@ unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; @@ -2751,10 +2486,11 @@ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { - ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, &bad_send_wr); if (ret) { - printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; @@ -2778,24 +2514,15 @@ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests @@ -2808,12 +2535,11 @@ struct 
ib_send_wr *bad_send_wr; mad_send_wr->retry = 0; - ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr, + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; @@ -2826,42 +2552,15 @@ IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) - printk(KERN_ERR PFX "mad_error_handler - " - "ib_modify_qp to RTS : %d\n", ret); + dev_err(&port_priv->device->dev, + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} - -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch (wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - } + return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2873,7 +2572,6 @@ INIT_LIST_HEAD(&cancel_list); - cancel_sa_cc_mads(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { @@ -2895,8 +2593,6 @@ &cancel_list, agent_list) { mad_send_wc.send_buf = &mad_send_wr->send_buf; list_del(&mad_send_wr->agent_list); - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); atomic_dec(&mad_agent_priv->refcount); @@ -2917,7 +2613,8 @@ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, agent_list) { - if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && + if (is_rmpp_data_mad(mad_agent_priv, + mad_send_wr->send_buf.mad) && &mad_send_wr->send_buf == send_buf) return mad_send_wr; } @@ -2936,13 +2633,7 @@ agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr) { - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms)) - return -EINVAL; - return 0; - } - if (mad_send_wr->status != IB_WC_SUCCESS) { + if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } @@ -2980,10 +2671,14 @@ int free_mad; struct ib_wc wc; struct ib_mad_send_wc mad_send_wc; + bool opa; mad_agent_priv = container_of(work, struct ib_mad_agent_private, local_work); + opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->local_list)) { local = list_entry(mad_agent_priv->local_list.next, @@ -2993,9 +2688,11 @@ spin_unlock_irqrestore(&mad_agent_priv->lock, flags); free_mad = 0; if (local->mad_priv) { + u8 base_version; recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { - printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local 
completion\n"); free_mad = 1; goto local_send_completion; } @@ -3005,25 +2702,35 @@ * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), - 0, recv_mad_agent->agent.port_num, &wc); + local->mad_send_wr->send_wr.pkey_index, + recv_mad_agent->agent.port_num, &wc); local->mad_priv->header.recv_wc.wc = &wc; - local->mad_priv->header.recv_wc.mad_len = - sizeof(struct ib_mad); + + base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; + if (opa && base_version == OPA_MGMT_BASE_VERSION) { + local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); list_add(&local->mad_priv->header.recv_wc.recv_buf.list, &local->mad_priv->header.recv_wc.rmpp_list); local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = - &local->mad_priv->mad.mad; + (struct ib_mad *)local->mad_priv->mad; if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) snoop_recv(recv_mad_agent->qp_info, &local->mad_priv->header.recv_wc, IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); @@ -3045,7 +2752,7 @@ spin_lock_irqsave(&mad_agent_priv->lock, flags); atomic_dec(&mad_agent_priv->refcount); if (free_mad) - kmem_cache_free(ib_mad_cache, local->mad_priv); + kfree(local->mad_priv); kfree(local); } spin_unlock_irqrestore(&mad_agent_priv->lock, flags); @@ -3063,7 +2770,7 @@ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); - if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); switch (ret) { case IB_RMPP_RESULT_UNHANDLED: @@ -3126,8 +2833,6 @@ else mad_send_wc.status = mad_send_wr->status; mad_send_wc.send_buf = &mad_send_wr->send_buf; - if (mad_send_wr->is_sa_cc_mad) - sa_cc_mad_done(get_cc_obj(mad_send_wr)); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); @@ -3137,17 +2842,6 @@ spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ @@ -3162,8 +2856,7 @@ struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ - sg_list.length = sizeof *mad_priv - sizeof mad_priv->header; - sg_list.lkey = (*qp_info->port_priv->mr).lkey; + sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; /* Initialize common receive WR fields */ recv_wr.next = NULL; @@ -3176,29 +2869,29 @@ mad_priv = mad; mad = NULL; } else { - mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); + mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), + GFP_ATOMIC); if (!mad_priv) { - printk(KERN_ERR PFX 
"No memory for receive buffer\n"); + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); ret = -ENOMEM; break; } } + sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, &mad_priv->grh, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; - kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_dma_map_single failed\n"); break; } - mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); @@ -3213,11 +2906,11 @@ spin_unlock_irqrestore(&recv_queue->lock, flags); ib_dma_unmap_single(qp_info->port_priv->device, mad_priv->header.mapping, - sizeof *mad_priv - - sizeof mad_priv->header, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + kfree(mad_priv); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); break; } } while (post); @@ -3252,10 +2945,9 @@ ib_dma_unmap_single(qp_info->port_priv->device, recv->header.mapping, - sizeof(struct ib_mad_private) - - sizeof(struct ib_mad_private_header), + mad_priv_dma_size(recv), DMA_FROM_DEVICE); - kmem_cache_free(ib_mad_cache, recv); + kfree(recv); } qp_info->recv_queue.count = 0; @@ -3269,16 +2961,17 @@ int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; - u16 pkey_index = 0; + u16 pkey_index; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } ret = ib_find_pkey(port_priv->device, port_priv->port_num, - 0xFFFF, &pkey_index); + IB_DEFAULT_PKEY_FULL, &pkey_index); if (ret) pkey_index = 0; @@ -3297,16 +2990,18 @@ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "INIT: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTR: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); goto out; } @@ -3314,16 +3009,18 @@ attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTS: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { - printk(KERN_ERR PFX "Failed to request completion " - "notification: %d\n", ret); + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); goto out; } @@ -3333,7 +3030,8 @@ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { - printk(KERN_ERR PFX "Couldn't post receive WRs\n"); + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); goto out; } } @@ -3347,7 +3045,8 @@ struct ib_mad_qp_info *qp_info = qp_context; 
/* It's worse than that! He's dead, Jim! */ - printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } @@ -3393,8 +3092,9 @@ qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { - printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", - get_spl_qp_index(qp_type)); + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } @@ -3429,10 +3129,17 @@ char name[sizeof "ib_mad123"]; int has_smi; + if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) + return -EFAULT; + + if (WARN_ON(rdma_cap_opa_mad(device, port_num) && + rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) + return -EFAULT; + /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); return -ENOMEM; } @@ -3444,33 +3151,25 @@ init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; - has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + has_smi = rdma_cap_ib_smi(device, port_num); if (has_smi) cq_size *= 2; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, cq_size, 0); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { - printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } - port_priv->pd = ib_alloc_pd(device); + port_priv->pd = ib_alloc_pd(device, 0); if (IS_ERR(port_priv->pd)) { - printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } - port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(port_priv->mr)) { - printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); - ret = PTR_ERR(port_priv->mr); - goto error5; - } - if (has_smi) { ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); if (ret) @@ -3481,16 +3180,11 @@ goto error7; snprintf(name, sizeof name, "ib_mad%d", port_num); - port_priv->wq = create_singlethread_workqueue(name); + port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); if (!port_priv->wq) { ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); - - if (sa_cc_init(&port_priv->sa_cc)) - goto error9; - spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); @@ -3498,30 +3192,26 @@ ret = ib_mad_port_start(port_priv); if (ret) { - printk(KERN_ERR PFX "Couldn't start port\n"); - goto error10; + dev_err(&device->dev, "Couldn't start port\n"); + goto error9; } return 0; -error10: +error9: spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); -error9: - sa_cc_destroy(&port_priv->sa_cc); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: - ib_dereg_mr(port_priv->mr); -error5: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + 
ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: @@ -3544,19 +3234,17 @@ port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - printk(KERN_ERR PFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); destroy_workqueue(port_priv->wq); - sa_cc_destroy(&port_priv->sa_cc); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); - ib_dereg_mr(port_priv->mr); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ @@ -3568,29 +3256,21 @@ static void ib_mad_init_device(struct ib_device *device) { - int start, end, i; + int start, i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + start = rdma_start_port(device); - if (device->node_type == RDMA_NODE_IB_SWITCH) { - start = 0; - end = 0; - } else { - start = 1; - end = device->phys_port_cnt; - } + for (i = start; i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); goto error_agent; } } @@ -3598,46 +3278,34 @@ error_agent: if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); error: - i--; + while (--i >= start) { + if (!rdma_cap_ib_mad(device, i)) + continue; - while (i >= start) { if (ib_agent_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); - i--; + dev_err(&device->dev, "Couldn't close port %d\n", i); } } -static void ib_mad_remove_device(struct ib_device *device) +static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i, num_ports, cur_port; + int i; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; - if (device->node_type == RDMA_NODE_IB_SWITCH) { - num_ports = 1; - cur_port = 0; - } else { - num_ports = device->phys_port_cnt; - cur_port = 1; - } - for (i = 0; i < num_ports; i++, cur_port++) { - if (ib_agent_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, cur_port); - if (ib_mad_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, cur_port); + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); } } @@ -3647,48 
+3315,25 @@ .remove = ib_mad_remove_device }; -static int __init ib_mad_init_module(void) +int ib_mad_init(void) { - int ret; - mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); - ib_mad_cache = kmem_cache_create("ib_mad", - sizeof(struct ib_mad_private), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!ib_mad_cache) { - printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); - ret = -ENOMEM; - goto error1; - } - INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { - printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); - ret = -EINVAL; - goto error2; + pr_err("Couldn't register ib_mad client\n"); + return -EINVAL; } return 0; - -error2: - kmem_cache_destroy(ib_mad_cache); -error1: - return ret; } -static void __exit ib_mad_cleanup_module(void) +void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); - kmem_cache_destroy(ib_mad_cache); } - -module_init(ib_mad_init_module); -module_exit(ib_mad_cleanup_module); Index: sys/ofed/drivers/infiniband/core/ib_mad_rmpp.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_mad_rmpp.c +++ sys/ofed/drivers/infiniband/core/ib_mad_rmpp.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2005 Intel Inc. All rights reserved. * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -67,6 +68,7 @@ u8 mgmt_class; u8 class_version; u8 method; + u8 base_version; }; static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) @@ -139,7 +141,8 @@ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, hdr_len, - 0, GFP_KERNEL); + 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) return; @@ -165,7 +168,8 @@ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, - hdr_len, 0, GFP_KERNEL); + hdr_len, 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) ib_destroy_ah(ah); else { @@ -316,6 +320,7 @@ rmpp_recv->mgmt_class = mad_hdr->mgmt_class; rmpp_recv->class_version = mad_hdr->class_version; rmpp_recv->method = mad_hdr->method; + rmpp_recv->base_version = mad_hdr->base_version; return rmpp_recv; error: kfree(rmpp_recv); @@ -431,14 +436,23 @@ { struct ib_rmpp_mad *rmpp_mad; int hdr_size, data_size, pad; + bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, + rmpp_recv->agent->qp_info->port_priv->port_num); rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - data_size = sizeof(struct ib_rmpp_mad) - hdr_size; - pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); - if (pad > IB_MGMT_RMPP_DATA || pad < 0) - pad = 0; + if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { + data_size = sizeof(struct opa_rmpp_mad) - hdr_size; + pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > OPA_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } else { + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - 
be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } return hdr_size + rmpp_recv->seg_num * data_size - pad; } @@ -570,13 +584,14 @@ if (mad_send_wr->seg_num == 1) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; - paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA - - mad_send_wr->pad; + paylen = (mad_send_wr->send_buf.seg_count * + mad_send_wr->send_buf.seg_rmpp_size) - + mad_send_wr->pad; } if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; - paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad; + paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; } rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); Index: sys/ofed/drivers/infiniband/core/ib_multicast.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_multicast.c +++ sys/ofed/drivers/infiniband/core/ib_multicast.c @@ -36,29 +36,16 @@ #include #include #include -#include #include #include #include -#include #include #include #include "sa.h" -static int mcast_leave_retries = 3; - -/*static const struct kernel_param_ops retry_ops = { - .set = param_set_int, - .get = param_get_int, -}; - -module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644); -MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave " - "requests before giving up (default: 3)"); -*/ static void mcast_add_one(struct ib_device *device); -static void mcast_remove_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { .name = "ib_multicast", @@ -117,11 +104,10 @@ struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; - int members[3]; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; - int query_id; u16 pkey_index; u8 leave_state; int retries; @@ -235,8 +221,9 @@ } /* - * A multicast group has three types of members: full member, non member, and - * send only member. We need to keep track of the number of members of each + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. 
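 * As a sketch of that mapping (assuming the conventional JoinState bit
 * order: full member, non member, send-only non member, send-only full
 * member), a join_state of 0x1 increments members[0] and 0x4 increments
 * members[2]; the leave state computed later sets bit i only once
 * members[i] has dropped back to zero.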
*/ @@ -244,7 +231,7 @@ { int i; - for (i = 0; i < 3; i++, join_state >>= 1) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } @@ -260,7 +247,7 @@ u8 leave_state = 0; int i; - for (i = 0; i < 3; i++) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); @@ -308,8 +295,8 @@ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, - IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, - src->mtu, dst->mtu)) + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) @@ -317,14 +304,14 @@ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, - IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, - src->rate, dst->rate)) + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, - dst->packet_life_time_selector, - src->packet_life_time, dst->packet_life_time)) + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; @@ -354,11 +341,7 @@ member->multicast.comp_mask, 3000, GFP_KERNEL, join_handler, group, &group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; + return (ret > 0) ? 0 : ret; } static int send_leave(struct mcast_group *group, u8 leave_state) @@ -378,11 +361,7 @@ IB_SA_MCMEMBER_REC_JOIN_STATE, 3000, GFP_KERNEL, leave_handler, group, &group->query); - if (ret >= 0) { - group->query_id = ret; - ret = 0; - } - return ret; + return (ret > 0) ? 0 : ret; } static void join_group(struct mcast_group *group, struct mcast_member *member, @@ -540,17 +519,22 @@ if (status) process_join_error(group, status); else { + int mgids_changed, is_mgid0; ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); - group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; - if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { rb_erase(&group->node, &group->port->table); - mcast_insert(group->port, group, 1); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } @@ -565,12 +549,8 @@ if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; - else { - if (status && group->retries <= 0) - printk(KERN_WARNING "reached max retry count. " - "status=%d. 
Giving up\n", status); + else mcast_work_handler(&group->work); - } } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -593,7 +573,7 @@ if (!group) return NULL; - group->retries = mcast_leave_retries; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; @@ -737,13 +717,27 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; @@ -794,8 +788,7 @@ int index; dev = container_of(handler, struct mcast_device, event_handler); - if (rdma_port_get_link_layer(dev->device, event->element.port_num) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) return; index = event->element.port_num - dev->start_port; @@ -803,6 +796,7 @@ switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; @@ -821,24 +815,16 @@ int i; int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; - if (device->node_type == RDMA_NODE_IB_SWITCH) - dev->start_port = dev->end_port = 0; - else { - dev->start_port = 1; - dev->end_port = device->phys_port_cnt; - } + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) != - IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_mcast(device, dev->start_port + i)) continue; port = &dev->port[i]; port->dev = dev; @@ -862,13 +848,12 @@ ib_register_event_handler(&dev->event_handler); } -static void mcast_remove_one(struct ib_device *device) +static void mcast_remove_one(struct ib_device *device, void *client_data) { - struct mcast_device *dev; + struct mcast_device *dev = client_data; struct mcast_port *port; int i; - dev = ib_get_client_data(device, &mcast_client); if (!dev) return; @@ -876,8 +861,7 @@ flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) == - IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_mcast(device, dev->start_port + i)) { port = &dev->port[i]; deref_port(port); wait_for_completion(&port->comp); @@ -891,7 +875,7 @@ { int ret; - mcast_wq = create_singlethread_workqueue("ib_mcast"); + mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); if (!mcast_wq) return -ENOMEM; Index: sys/ofed/drivers/infiniband/core/ib_packer.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_packer.c +++ sys/ofed/drivers/infiniband/core/ib_packer.c @@ -31,7 +31,6 @@ * SOFTWARE. 
*/ -#include #include #include @@ -39,12 +38,12 @@ static u64 value_read(int offset, int size, void *structure) { switch (size) { - case 1: return *(u8 *) (structure + offset); - case 2: return be16_to_cpup((__be16 *) (structure + offset)); - case 4: return be32_to_cpup((__be32 *) (structure + offset)); - case 8: return be64_to_cpup((__be64 *) (structure + offset)); + case 1: return *(u8 *) ((char *)structure + offset); + case 2: return be16_to_cpup((__be16 *) ((char *)structure + offset)); + case 4: return be32_to_cpup((__be32 *) ((char *)structure + offset)); + case 8: return be64_to_cpup((__be64 *) ((char *)structure + offset)); default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); return 0; } } @@ -104,18 +103,17 @@ } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } if (desc[i].struct_size_bytes) - memcpy(buf + desc[i].offset_words * 4 + + memcpy((char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, - structure + desc[i].struct_offset_bytes, + (char *)structure + desc[i].struct_offset_bytes, desc[i].size_bits / 8); else - memset(buf + desc[i].offset_words * 4 + + memset((char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, 0, desc[i].size_bits / 8); @@ -127,12 +125,12 @@ static void value_write(int offset, int size, u64 val, void *structure) { switch (size * 8) { - case 8: *( u8 *) (structure + offset) = val; break; - case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break; - case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; - case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; + case 8: *( u8 *) ((char *)structure + offset) = val; break; + case 16: *(__be16 *) ((char *)structure + offset) = cpu_to_be16(val); break; + case 32: *(__be32 *) ((char *)structure + offset) = cpu_to_be32(val); break; + case 64: *(__be64 *) ((char *)structure + offset) = cpu_to_be64(val); break; default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); } } @@ -188,13 +186,12 @@ } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } - memcpy(structure + desc[i].struct_offset_bytes, - buf + desc[i].offset_words * 4 + + memcpy((char *)structure + desc[i].struct_offset_bytes, + (char *)buf + desc[i].offset_words * 4 + desc[i].offset_bits / 8, desc[i].size_bits / 8); } Index: sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2015-2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include + +#include +#include + +#include + +static struct workqueue_struct *roce_gid_mgmt_wq; + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct roce_gid_scan_event_work { + struct work_struct work; + struct net_device *ndev; +}; + +struct roce_rescan_work { + struct work_struct work; + struct ib_device *ib_dev; +}; + +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u8 port, union ib_gid *gid, struct net_device *ndev) +{ + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + struct ib_gid_attr gid_attr; + + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (i = 0; i != IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr.gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, &gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, &gid_attr); + break; + } + } + } +} + +static int +roce_gid_match_netdev(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + struct net_device *ndev = (struct net_device *)cookie; + if (idev == NULL) + return (0); + return (ndev == idev); +} + +static int +roce_gid_match_all(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + if (idev == NULL) + return (0); + return (1); +} + +static int +roce_gid_enum_netdev_default(struct ib_device *ib_dev, + u8 port, struct net_device *idev) +{ + unsigned long gid_type_mask; + + 
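+	/*
+	 * Install the default GID entries for every RoCE GID type this
+	 * port supports; the value returned below tells the caller how
+	 * many leading default entries to skip when it later scans the
+	 * cache for stale GIDs.
+	 */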
gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, idev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_SET); + + return (hweight_long(gid_type_mask)); +} + +#define ETH_IPOIB_DRV_NAME "ib" + +static inline int +is_eth_ipoib_intf(struct net_device *dev) +{ + if (strcmp(dev->if_dname, ETH_IPOIB_DRV_NAME)) + return 0; + return 1; +} + +static void +roce_gid_update_addr_callback(struct ib_device *device, u8 port, + struct net_device *ndev, void *cookie) +{ + struct ipx_entry { + STAILQ_ENTRY(ipx_entry) entry; + union ipx_addr { + struct sockaddr sa[0]; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } ipx_addr; + }; + struct ipx_entry *entry; + struct net_device *idev; + struct ifaddr *ifa; + union ib_gid gid; + int default_gids; + u16 index_num; + int i; + + STAILQ_HEAD(, ipx_entry) ipx_head; + + STAILQ_INIT(&ipx_head); + + /* make sure default GIDs are in */ + default_gids = roce_gid_enum_netdev_default(device, port, ndev); + + CURVNET_SET(ndev->if_vnet); + IFNET_RLOCK(); + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + if (idev != ndev) { + if (idev->if_type != IFT_L2VLAN) + continue; + if (ndev != rdma_vlan_dev_real_dev(idev)) + continue; + } + + /* clone address information for IPv4 and IPv6 */ + IF_ADDR_RLOCK(idev); +#if defined(INET) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv4 update\n"); + continue; + } + entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr); + STAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif +#if defined(INET6) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET6) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv6 update\n"); + continue; + } + entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr); + + /* trash IPv6 scope ID */ + sa6_recoverscope(&entry->ipx_addr.v6); + entry->ipx_addr.v6.sin6_scope_id = 0; + + STAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif + IF_ADDR_RUNLOCK(idev); + } + IFNET_RUNLOCK(); + CURVNET_RESTORE(); + + /* add missing GIDs, if any */ + STAILQ_FOREACH(entry, &ipx_head, entry) { + unsigned long gid_type_mask = roce_gid_type_mask_support(device, port); + + if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0) + continue; + + for (i = 0; i != IB_GID_TYPE_SIZE; i++) { + if (!((1UL << i) & gid_type_mask)) + continue; + /* check if entry found */ + if (ib_find_cached_gid_by_port(device, &gid, i, + port, ndev, &index_num) == 0) + break; + } + if (i != IB_GID_TYPE_SIZE) + continue; + /* add new GID */ + update_gid(GID_ADD, device, port, &gid, ndev); + } + + /* remove stale GIDs, if any */ + for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, NULL) == 0; i++) { + union ipx_addr ipx; + + /* don't delete empty entries */ + if (memcmp(&gid, &zgid, sizeof(zgid)) == 0) + continue; + + /* zero default */ + memset(&ipx, 0, sizeof(ipx)); + + rdma_gid2ip(&ipx.sa[0], &gid); + + STAILQ_FOREACH(entry, &ipx_head, entry) { + if (memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0) + break; + } + /* check if entry found */ + if (entry != NULL) + continue; + + /* remove GID */ + update_gid(GID_DEL, device, port, &gid, ndev); + } + + while ((entry = 
STAILQ_FIRST(&ipx_head))) { + STAILQ_REMOVE_HEAD(&ipx_head, entry); + kfree(entry); + } +} + +static void +roce_gid_queue_scan_event_handler(struct work_struct *_work) +{ + struct roce_gid_scan_event_work *work = + container_of(_work, struct roce_gid_scan_event_work, work); + + ib_enum_all_roce_netdevs(roce_gid_match_netdev, work->ndev, + roce_gid_update_addr_callback, NULL); + + dev_put(work->ndev); + kfree(work); +} + +static void +roce_gid_queue_scan_event(struct net_device *ndev) +{ + struct roce_gid_scan_event_work *work; + +retry: + if (is_eth_ipoib_intf(ndev)) + return; + + if (ndev->if_type != IFT_ETHER) { + if (ndev->if_type == IFT_L2VLAN) { + ndev = rdma_vlan_dev_real_dev(ndev); + if (ndev != NULL) + goto retry; + } + return; + } + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); + return; + } + + INIT_WORK(&work->work, roce_gid_queue_scan_event_handler); + dev_hold(ndev); + + work->ndev = ndev; + + queue_work(roce_gid_mgmt_wq, &work->work); +} + +static int +inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *ndev = ptr; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGEIFADDR: + roce_gid_queue_scan_event(ndev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; + +static void +roce_rescan_device_handler(struct work_struct *_work) +{ + struct roce_rescan_work *work = + container_of(_work, struct roce_rescan_work, work); + + ib_enum_roce_netdev(work->ib_dev, roce_gid_match_all, NULL, + roce_gid_update_addr_callback, NULL); + kfree(work); +} + +/* Caller must flush system workqueue before removing the ib_device */ +int roce_rescan_device(struct ib_device *ib_dev) +{ + struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL); + + if (!work) + return -ENOMEM; + + work->ib_dev = ib_dev; + INIT_WORK(&work->work, roce_rescan_device_handler); + queue_work(roce_gid_mgmt_wq, &work->work); + + return 0; +} + +int __init roce_gid_mgmt_init(void) +{ + roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0); + if (!roce_gid_mgmt_wq) { + pr_warn("roce_gid_mgmt: can't allocate work queue\n"); + return -ENOMEM; + } + + register_inetaddr_notifier(&nb_inetaddr); + + /* + * We rely on the netdevice notifier to enumerate all existing + * devices in the system. Register to this notifier last to + * make sure we will not miss any IP add/del callbacks. + */ + register_netdevice_notifier(&nb_inetaddr); + + return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_inetaddr); + + /* + * Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to free'd memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts. 
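+ * The address notifiers are unregistered first so that no new scan
+ * work can be queued while the workqueue is drained and destroyed.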
+ */ + synchronize_rcu(); + drain_workqueue(roce_gid_mgmt_wq); + destroy_workqueue(roce_gid_mgmt_wq); +} Index: sys/ofed/drivers/infiniband/core/ib_sa_query.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_sa_query.c +++ sys/ofed/drivers/infiniband/core/ib_sa_query.c @@ -41,14 +41,18 @@ #include #include #include - +#include #include #include +#include +#include +#include #include "sa.h" +#include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 struct ib_sa_sm_ah { struct ib_ah *ah; @@ -57,10 +61,17 @@ u8 src_path_mask; }; +struct ib_sa_classport_cache { + bool valid; + struct ib_class_port_info data; +}; + struct ib_sa_port { struct ib_mad_agent *agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; + struct ib_sa_classport_cache classport_info; + spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u8 port_num; }; @@ -79,8 +90,16 @@ struct ib_mad_send_buf *mad_buf; struct ib_sa_sm_ah *sm_ah; int id; + u32 flags; + struct list_head list; /* Local svc request list */ + u32 seq; /* Local svc request sequence number */ + unsigned long timeout; /* Local svc timeout */ + u8 path_use; /* How will the pathrecord be used */ }; +#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 +#define IB_SA_CANCEL 0x00000002 + struct ib_sa_service_query { void (*callback)(int, struct ib_sa_service_rec *, void *); void *context; @@ -99,6 +118,12 @@ struct ib_sa_query sa_query; }; +struct ib_sa_classport_info_query { + void (*callback)(int, struct ib_class_port_info *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; @@ -106,7 +131,7 @@ }; static void ib_sa_add_one(struct ib_device *device); -static void ib_sa_remove_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { .name = "sa", @@ -352,6 +377,82 @@ .size_bits = 2*64 }, }; +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { 
CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ @@ -380,6 +481,11 @@ .size_bits = 512 }, }; +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ + query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -397,13 +503,12 @@ struct ib_ah_attr ah_attr; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { - printk(KERN_WARNING "Couldn't query port\n"); + pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); if (!new_ah) { - printk(KERN_WARNING "Couldn't allocate new SM AH\n"); return; } @@ -413,16 +518,21 @@ new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) - printk(KERN_ERR "Couldn't find index for default PKey\n"); + pr_err("Couldn't find index for default PKey\n"); memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = port_attr.sm_lid; ah_attr.sl = port_attr.sm_sl; ah_attr.port_num = port->port_num; + if (port_attr.grh_required) { + ah_attr.ah_flags = IB_AH_GRH; + ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix); + ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); + } new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { - printk(KERN_WARNING "Couldn't create new SM AH\n"); + pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } @@ -449,7 +559,7 @@ struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; - if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(handler->device, port->port_num)) return; spin_lock_irqsave(&port->ah_lock, flags); @@ -458,6 +568,13 @@ port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE) { + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + spin_unlock_irqrestore(&port->classport_lock, flags); + } queue_work(ib_wq, &sa_dev->port[event->element.port_num - sa_dev->start_port].update_task); } @@ -500,8 +617,6 @@ agent = query->port->agent; mad_buf = query->mad_buf; spin_unlock_irqrestore(&idr_lock, flags); - - ib_cancel_mad(agent, mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); @@ -529,7 +644,8 @@ { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -539,29 +655,87 @@ ah_attr->port_num = 
port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + use_roce = rdma_cap_eth_ah(device, port_num); + + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; - if (rec->hop_limit > 1 || force_grh) { + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (resolved_dev->if_flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + + if (rec->hop_limit > 0 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, &port_num, - &gid_index); - if (ret) + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); + if (ret) { + if (ndev) + dev_put(ndev); return ret; + } ah_attr->grh.sgid_index = gid_index; ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; + if (ndev) + dev_put(ndev); } - if (force_grh) { - memcpy(ah_attr->dmac, rec->dmac, 6); - ah_attr->vlan_id = rec->vlan_id; - } else { - memset(ah_attr->dmac, 0, 6); - ah_attr->vlan_id = 0xffff; - } + + if (use_roce) + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); return 0; } @@ -583,7 +757,8 @@ query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, - gfp_mask); + gfp_mask, + IB_MGMT_BASE_VERSION); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; @@ -618,24 +793,30 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) { + bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; -retry: - if (!idr_pre_get(&query_idr, gfp_mask)) - return -ENOMEM; + if (preload) + idr_preload(gfp_mask); spin_lock_irqsave(&idr_lock, flags); - ret = idr_get_new(&query_idr, query, &id); + + id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); + spin_unlock_irqrestore(&idr_lock, flags); - if (ret == -EAGAIN) - goto retry; - if (ret) - return ret; + if (preload) + idr_preload_end(); + if (id < 0) + return id; query->mad_buf->timeout_ms = timeout_ms; query->mad_buf->context[0] = query; query->id = id; + if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { + ib_sa_disable_local_svc(query); + } + ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { 
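/*
 * Posting the MAD failed: undo the query-id bookkeeping done above,
 * under idr_lock, before handing the error back to the caller.
 */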
spin_lock_irqsave(&idr_lock, flags); @@ -657,6 +838,12 @@ } EXPORT_SYMBOL(ib_sa_unpack_path); +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) @@ -669,10 +856,10 @@ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); - rec.vlan_id = 0xffff; - memset(rec.dmac, 0, ETH_ALEN); - memset(rec.smac, 0, ETH_ALEN); - + rec.net = NULL; + rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; + eth_zero_addr(rec.dmac); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -683,7 +870,6 @@ kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } - /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client @@ -710,15 +896,15 @@ * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); @@ -733,7 +919,7 @@ port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -760,6 +946,9 @@ *sa_query = &query->sa_query; + query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; + query->sa_query.mad_buf->context[1] = rec; + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; @@ -855,7 +1044,7 @@ method != IB_SA_METHOD_DELETE) return -EINVAL; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -947,7 +1136,7 @@ port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; @@ -993,8 +1182,8 @@ /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, + struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); @@ -1027,7 +1216,7 @@ { struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; + struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; @@ -1044,19 +1233,19 @@ port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - query = kmalloc(sizeof *query, gfp_mask); + query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; - query->sa_query.port = port; + query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; - query->callback = callback; - query->context = context; + query->callback = callback; + query->context = context; mad = 
query->sa_query.mad_buf->mad; init_mad(mad, agent); @@ -1090,6 +1279,121 @@ } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); +/* Support get SA ClassPortInfo */ +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + unsigned long flags; + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + + if (mad) { + struct ib_class_port_info rec; + + ib_unpack(classport_info_rec_table, + ARRAY_SIZE(classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, flags); + if (!status && !sa_query->port->classport_info.valid) { + memcpy(&sa_query->port->classport_info.data, &rec, + sizeof(sa_query->port->classport_info.data)); + + sa_query->port->classport_info.valid = true; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); + + query->callback(status, &rec, query->context); + } else { + query->callback(status, NULL, query->context); + } +} + +static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_classport_info_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + struct ib_class_port_info cached_class_port_info; + int ret; + unsigned long flags; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + /* Use cached ClassPortInfo attribute if valid instead of sending mad */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid && callback) { + memcpy(&cached_class_port_info, &port->classport_info.data, + sizeof(cached_class_port_info)); + spin_unlock_irqrestore(&port->classport_lock, flags); + callback(0, &cached_class_port_info, context); + return 0; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? 
ib_sa_classport_info_rec_callback : NULL; + + query->sa_query.release = ib_sa_portclass_info_rec_release; + /* support GET only */ + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { @@ -1122,14 +1426,15 @@ } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, @@ -1147,16 +1452,10 @@ { struct ib_sa_device *sa_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } + s = rdma_start_port(device); + e = rdma_end_port(device); sa_dev = kzalloc(sizeof *sa_dev + (e - s + 1) * sizeof (struct ib_sa_port), @@ -1169,22 +1468,30 @@ for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); - if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + if (!rdma_cap_ib_sa(device, i + 1)) continue; sa_dev->port[i].sm_ah = NULL; sa_dev->port[i].port_num = i + s; + spin_lock_init(&sa_dev->port[i].classport_lock); + sa_dev->port[i].classport_info.valid = false; + sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, - recv_handler, sa_dev); + recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &sa_client, sa_dev); /* @@ -1196,31 +1503,28 @@ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); if (ib_register_event_handler(&sa_dev->event_handler)) - goto reg_err; + goto err; - for (i = 0; i <= e - s; ++i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + for (i = 0; i <= e - s; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) update_sm_ah(&sa_dev->port[i].update_task); + } return; -reg_err: - ib_set_client_data(device, &sa_client, NULL); - i = e - s; err: - for (; i >= 0; --i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND && - !IS_ERR(sa_dev->port[i].agent)) - ib_unregister_mad_agent(sa_dev->port[i].agent); - + while (--i >= 0) { + if (rdma_cap_ib_sa(device, i + 1)) + ib_unregister_mad_agent(sa_dev->port[i].agent); + } +free: kfree(sa_dev); - return; } -static void ib_sa_remove_one(struct ib_device *device) +static void ib_sa_remove_one(struct ib_device *device, void *client_data) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_device *sa_dev = client_data; int i; if (!sa_dev) @@ -1231,7 +1535,7 @@ flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { - if (rdma_port_get_link_layer(device, i + 1) == 
IB_LINK_LAYER_INFINIBAND) { + if (rdma_cap_ib_sa(device, i + 1)) { ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); @@ -1242,7 +1546,7 @@ kfree(sa_dev); } -static int __init ib_sa_init(void) +int ib_sa_init(void) { int ret; @@ -1250,29 +1554,27 @@ ret = ib_register_client(&sa_client); if (ret) { - printk(KERN_ERR "Couldn't register ib_sa client\n"); + pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { - printk(KERN_ERR "Couldn't initialize multicast handling\n"); + pr_err("Couldn't initialize multicast handling\n"); goto err2; } return 0; + err2: ib_unregister_client(&sa_client); err1: return ret; } -static void __exit ib_sa_cleanup(void) +void ib_sa_cleanup(void) { mcast_cleanup(); ib_unregister_client(&sa_client); idr_destroy(&query_idr); } - -module_init_order(ib_sa_init, SI_ORDER_SECOND); -module_exit(ib_sa_cleanup); Index: sys/ofed/drivers/infiniband/core/ib_smi.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_smi.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include "smi.h" +#include "opa_smi.h" + +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + const u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 */ + if (hop_cnt && *hop_ptr == 0) { + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:2 */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* return_path set when received */ + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + /* return_path set when received */ + (*hop_ptr)++; + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- Fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- at the end of the DR segment of path */ + if (*hop_ptr == 1) { + (*hop_ptr)--; + /* C14-13:3 -- SMPs destined for SM shouldn't be here */ + return (is_switch || + dr_slid_is_permissive ? 
+ IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ + if (*hop_ptr == 0) + return IB_SMI_HANDLE; + + /* C14-13:5 -- Check for unreasonable hop pointer */ + return IB_SMI_DISCARD; + } +} + +/* + * Fixup a directed route SMP for sending + * Return IB_SMI_DISCARD if the SMP should be discarded + */ +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, + int phys_port_cnt, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 -- sender should have incremented hop_ptr */ + if (hop_cnt && *hop_ptr == 0) + return IB_SMI_DISCARD; + + /* C14-9:2 -- intermediate hop */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + return (initial_path[*hop_ptr+1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + if (hop_cnt) + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* hop_ptr updated when sending */ + return (return_path[*hop_ptr-1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == 1) { + if (dr_slid_is_permissive) { + /* giving SMP to SM - update hop_ptr */ + (*hop_ptr)--; + return IB_SMI_HANDLE; + } + /* hop_ptr updated when sending */ + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> give to SM */ + /* C14-13:5 -- Check for unreasonable hop pointer */ + return (*hop_ptr == 0 ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); + } +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + if (!direction) { + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) + return IB_SMI_FORWARD; + + /* C14-9:3 -- at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) + return (dr_dlid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + if (hop_ptr == hop_cnt + 1) + return IB_SMI_SEND; + } else { + /* C14-13:2 -- intermediate hop */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) + return IB_SMI_FORWARD; + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) + return (!dr_slid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + } + return IB_SMI_LOCAL; + +} + +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int smi_get_fwd_port(struct ib_smp *smp) +{ + return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : + smp->return_path[smp->hop_ptr-1]); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int opa_smi_get_fwd_port(struct opa_smp *smp) +{ + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] : + smp->route.dr.return_path[smp->hop_ptr-1]; +} Index: sys/ofed/drivers/infiniband/core/ib_sysfs.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_sysfs.c @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include +#include +#include + +#include +#include + +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; +struct ib_port { + struct kobject kobj; + struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; + struct attribute_group gid_group; + struct attribute_group pkey_group; + struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; +}; + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct ib_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +#define PORT_ATTR(_name, _mode, _show, _store) \ +struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define PORT_ATTR_RO(_name) \ +struct port_attribute port_attr_##_name = __ATTR_RO(_name) + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; + +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show +}; + +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + +static ssize_t state_show(struct ib_port *p, 
struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + static const char *state_name[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" + }; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d: %s\n", attr.state, + attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : "UNKNOWN"); +} + +static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.lid); +} + +static ssize_t lid_mask_count_show(struct ib_port *p, + struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.lmc); +} + +static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.sm_lid); +} + +static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.sm_sl); +} + +static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%08x\n", attr.port_cap_flags); +} + +static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + char *speed = ""; + int rate; /* in deci-Gb/sec */ + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.active_speed) { + case IB_SPEED_DDR: + speed = " DDR"; + rate = 50; + break; + case IB_SPEED_QDR: + speed = " QDR"; + rate = 100; + break; + case IB_SPEED_FDR10: + speed = " FDR10"; + rate = 100; + break; + case IB_SPEED_FDR: + speed = " FDR"; + rate = 140; + break; + case IB_SPEED_EDR: + speed = " EDR"; + rate = 250; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + rate = 25; + break; + } + + rate *= ib_width_enum_to_int(attr.active_width); + if (rate < 0) + return -EINVAL; + + return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.phys_state) { + case 1: return sprintf(buf, "1: Sleep\n"); + case 2: return sprintf(buf, "2: Polling\n"); + case 3: return sprintf(buf, "3: Disabled\n"); + case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); + case 5: return sprintf(buf, "5: LinkUp\n"); + case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); + case 7: return sprintf(buf, "7: Phy Test\n"); + default: return sprintf(buf, "%d: \n", attr.phys_state); + } +} + +static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +} + +static PORT_ATTR_RO(state); +static PORT_ATTR_RO(lid); +static PORT_ATTR_RO(lid_mask_count); +static PORT_ATTR_RO(sm_lid); +static PORT_ATTR_RO(sm_sl); +static PORT_ATTR_RO(cap_mask); +static PORT_ATTR_RO(rate); +static PORT_ATTR_RO(phys_state); +static PORT_ATTR_RO(link_layer); + +static struct attribute *port_default_attrs[] = { + &port_attr_state.attr, + &port_attr_lid.attr, + &port_attr_lid_mask_count.attr, + &port_attr_sm_lid.attr, + &port_attr_sm_sl.attr, + &port_attr_cap_mask.attr, + &port_attr_rate.attr, + &port_attr_phys_state.attr, + &port_attr_link_layer.attr, + NULL +}; + +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", if_name(gid_attr->ndev)); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + return ret; +} + +static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); + if (ret) + return ret; + + return sprintf(buf, GID_PRINT_FMT"\n", GID_PRINT_ARGS(gid.raw)); +} + +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + +static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + u16 pkey; + ssize_t ret; + + 
ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS , \ +} + +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. + */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) +{ + struct ib_mad *in_mad; + struct ib_mad *out_mad; + size_t mad_size = sizeof(*out_mad); + u16 out_mad_pkey_index = 0; + ssize_t ret; + + if (!dev->process_mad) + return -ENOSYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + in_mad->mad_hdr.attr_id = attr; + + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ + + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, + (const struct ib_mad_hdr *)in_mad, mad_size, + (struct ib_mad_hdr *)out_mad, &mad_size, + &out_mad_pkey_index) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); + + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (*data >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", *data); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)data)); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + (unsigned long long)be64_to_cpup((__be64 *)data)); + break; + + default: + ret = 0; + } + + return ret; +} + +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); +static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); +static 
PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); + +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + +static struct attribute *pma_attrs[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + 
&port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute_group pma_group = { + .name = "counters", + .attrs = pma_attrs +}; + +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + +static void ib_port_release(struct kobject *kobj) +{ + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + struct attribute *a; + int i; + + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + + kfree(p->gid_group.attrs); + } + + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + + kfree(p->pkey_group.attrs); + } + + kfree(p); +} + +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + +static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, + .default_attrs = port_default_attrs +}; + +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct ib_port *, + struct port_attribute *, char *buf), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof(struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + + if (snprintf(element->name, sizeof(element->name), + "%d", i) >= sizeof(element->name)) { + kfree(element); + goto err; + } + + element->attr.attr.name = element->name; + element->attr.attr.mode = S_IRUGO; + element->attr.show = show; + element->index = i; + sysfs_attr_init(&element->attr.attr); + + tab_attr[i] = &element->attr.attr; + } + + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +/* + * Figure out which counter table to use depending on + * the device capabilities. 
+ */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + return ret; + return print_hw_stat(stats, hsa->index, buf); +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + msecs = jiffies_to_msecs(p->hw_stats->lifespan); + } + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + dev->hw_stats->lifespan = jiffies; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + p->hw_stats->lifespan = jiffies; + } + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = __DECONST(char *, name); + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = 
port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag; + struct rdma_hw_stats *stats; + int i, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + hsag = kzalloc(sizeof(*hsag) + + sizeof(void *) * (stats->num_counters + 2), + GFP_KERNEL); + if (!hsag) + goto err_free_stats; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err_free_hsag; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)((char *)hsag + sizeof(*hsag)); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + sysfs_attr_init(hsag->attrs[i]); + } + + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + if (hsag->attrs[i]) + sysfs_attr_init(hsag->attrs[i]); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + for (; i >= 0; i--) + kfree(hsag->attrs[i]); +err_free_hsag: + kfree(hsag); +err_free_stats: + kfree(stats); + return; +} + +static int add_port(struct ib_device *device, int port_num, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct ib_port *p; + struct ib_port_attr attr; + int i; + int ret; + + ret = ib_query_port(device, port_num, &attr); + if (ret) + return ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->ibdev = device; + p->port_num = port_num; + + ret = kobject_init_and_add(&p->kobj, &port_type, + device->ports_parent, + "%d", port_num); + if (ret) { + kfree(p); + return ret; + } + + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; + goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + + p->gid_group.name = "gids"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_remove_pma; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if 
(!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + + p->pkey_group.name = "pkeys"; + p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, + attr.pkey_tbl_len); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_remove_gid_type; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + if (port_callback) { + ret = port_callback(device, port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } + + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + + list_add_tail(&p->kobj.entry, &device->port_list); + + return 0; + +err_remove_pkey: + sysfs_remove_group(&p->kobj, &p->pkey_group); + +err_free_pkey: + for (i = 0; i < attr.pkey_tbl_len; ++i) + kfree(p->pkey_group.attrs[i]); + + kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; + +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + +err_remove_gid: + sysfs_remove_group(&p->kobj, &p->gid_group); + +err_free_gid: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_group.attrs[i]); + + kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; + +err_remove_pma: + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); + +err_put: + kobject_put(&p->kobj); + return ret; +} + +static ssize_t show_node_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + switch (dev->node_type) { + case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); + case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); + case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); + default: return sprintf(buf, "%d: \n", dev->node_type); + } +} + +static ssize_t show_sys_image_guid(struct device *device, + struct device_attribute *dev_attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + 
be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); +} + +static ssize_t show_node_guid(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->node_guid)[0]), + be16_to_cpu(((__be16 *) &dev->node_guid)[1]), + be16_to_cpu(((__be16 *) &dev->node_guid)[2]), + be16_to_cpu(((__be16 *) &dev->node_guid)[3])); +} + +static ssize_t show_node_desc(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%.64s\n", dev->node_desc); +} + +static ssize_t set_node_desc(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device_modify desc = {}; + int ret; + + if (!dev->modify_device) + return -EIO; + + memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); + ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (ret) + return ret; + + return count; +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + ib_get_device_fw_str(dev, buf, PAGE_SIZE); + strlcat(buf, "\n", PAGE_SIZE); + return strlen(buf); +} + +static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); +static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); +static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); +static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ib_class_attributes[] = { + &dev_attr_node_type, + &dev_attr_sys_image_guid, + &dev_attr_node_guid, + &dev_attr_node_desc, + &dev_attr_fw_ver, +}; + +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } + sysfs_remove_group(p, port->pma_table); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct device *class_dev = &device->dev; + int ret; + int i; + + device->dev.parent = device->dma_device; + ret = dev_set_name(class_dev, "%s", device->name); + if (ret) + return ret; + + ret = device_add(class_dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + ret = device_create_file(class_dev, ib_class_attributes[i]); + if (ret) + goto err_unregister; + } + + device->ports_parent = kobject_create_and_add("ports", + &class_dev->kobj); + if (!device->ports_parent) { + ret = -ENOMEM; + goto err_put; + } + + if (rdma_cap_ib_switch(device)) { + ret = add_port(device, 0, port_callback); + if 
(ret) + goto err_put; + } else { + for (i = 1; i <= device->phys_port_cnt; ++i) { + ret = add_port(device, i, port_callback); + if (ret) + goto err_put; + } + } + + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); + + return 0; + +err_put: + free_port_list_attributes(device); + +err_unregister: + device_unregister(class_dev); + +err: + return ret; +} + +void ib_device_unregister_sysfs(struct ib_device *device) +{ + int i; + + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); + + free_port_list_attributes(device); + + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); + + device_unregister(&device->dev); +} Index: sys/ofed/drivers/infiniband/core/ib_ucm.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_ucm.c +++ sys/ofed/drivers/infiniband/core/ib_ucm.c @@ -43,10 +43,10 @@ #include #include #include -#include #include +#include #include #include #include @@ -108,7 +108,7 @@ #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device, void *client_data); static struct ib_client ucm_client = { .name = "ucm", @@ -175,7 +175,6 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) { struct ib_ucm_context *ctx; - int result; ctx = kzalloc(sizeof *ctx, GFP_KERNEL); if (!ctx) @@ -186,17 +185,10 @@ ctx->file = file; INIT_LIST_HEAD(&ctx->events); - do { - result = idr_pre_get(&ctx_id_table, GFP_KERNEL); - if (!result) - goto error; - - mutex_lock(&ctx_id_mutex); - result = idr_get_new(&ctx_id_table, ctx, &ctx->id); - mutex_unlock(&ctx_id_mutex); - } while (result == -EAGAIN); - - if (result) + mutex_lock(&ctx_id_mutex); + ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&ctx_id_mutex); + if (ctx->id < 0) goto error; list_add_tail(&ctx->file_list, &file->ctxs); @@ -378,8 +370,6 @@ list_add_tail(&uevent->file_list, &ctx->file->events); list_add_tail(&uevent->ctx_list, &ctx->events); wake_up_interruptible(&ctx->file->poll_wait); - if (ctx->file->filp) - selwakeup(&ctx->file->filp->f_selinfo); mutex_unlock(&ctx->file->file_mutex); return 0; @@ -667,8 +657,7 @@ if (result) goto out; - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, - NULL); + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); out: ib_ucm_ctx_put(ctx); return result; @@ -703,14 +692,9 @@ if (!len) return 0; - data = kmalloc(len, GFP_KERNEL); - if (!data) - return -ENOMEM; - - if (copy_from_user(data, (void __user *)(unsigned long)src, len)) { - kfree(data); - return -EFAULT; - } + data = memdup_user((void __user *)(unsigned long)src, len); + if (IS_ERR(data)) + return PTR_ERR(data); *dest = data; return 0; @@ -1118,6 +1102,9 @@ struct ib_ucm_cmd_hdr hdr; ssize_t result; + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; + if (len < sizeof(hdr)) return -EINVAL; @@ -1207,6 +1194,7 @@ return 0; } +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1214,17 +1202,17 @@ ucm_dev = container_of(dev, struct ib_ucm_device, dev); cdev_del(&ucm_dev->cdev); if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - 
clear_bit(ucm_dev->devnum, dev_map); + clear_bit(ucm_dev->devnum, dev_map); else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); kfree(ucm_dev); } static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, + .owner = THIS_MODULE, + .open = ib_ucm_open, .release = ib_ucm_close, - .write = ib_ucm_write, + .write = ib_ucm_write, .poll = ib_ucm_poll, .llseek = no_llseek, }; @@ -1240,7 +1228,6 @@ static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static int find_overflow_devnum(void) { int ret; @@ -1249,7 +1236,7 @@ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + pr_err("ucm: couldn't register dynamic device number\n"); return ret; } } @@ -1267,8 +1254,7 @@ dev_t base; struct ib_ucm_device *ucm_dev; - if (!device->alloc_ucontext || - rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1)) return; ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); @@ -1281,7 +1267,7 @@ if (devnum >= IB_UCM_MAX_DEVICES) { devnum = find_overflow_devnum(); if (devnum < 0) - goto err; + goto err; ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; base = devnum + overflow_maj; @@ -1325,9 +1311,9 @@ return; } -static void ib_ucm_remove_one(struct ib_device *device) +static void ib_ucm_remove_one(struct ib_device *device, void *client_data) { - struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); + struct ib_ucm_device *ucm_dev = client_data; if (!ucm_dev) return; @@ -1335,12 +1321,8 @@ device_unregister(&ucm_dev->dev); } -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); -} - -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_CM_ABI_VERSION)); static int __init ib_ucm_init(void) { @@ -1349,25 +1331,25 @@ ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register device number\n"); + pr_err("ucm: couldn't register device number\n"); goto error1; } - ret = class_create_file(&cm_class, &class_attr_abi_version); + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); + pr_err("ucm: couldn't create abi_version attribute\n"); goto error2; } ret = ib_register_client(&ucm_client); if (ret) { - printk(KERN_ERR "ucm: couldn't register client\n"); + pr_err("ucm: couldn't register client\n"); goto error3; } return 0; error3: - class_remove_file(&cm_class, &class_attr_abi_version); + class_remove_file(&cm_class, &class_attr_abi_version.attr); error2: unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); error1: @@ -1377,7 +1359,7 @@ static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version); + class_remove_file(&cm_class, &class_attr_abi_version.attr); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); if (overflow_maj) unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); Index: sys/ofed/drivers/infiniband/core/ib_ucma.c =================================================================== 
--- sys/ofed/drivers/infiniband/core/ib_ucma.c +++ sys/ofed/drivers/infiniband/core/ib_ucma.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -61,6 +63,7 @@ struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; + struct workqueue_struct *close_wq; }; struct ucma_context { @@ -76,6 +79,13 @@ struct list_head list; struct list_head mc_list; + /* mark that device is in process of destroying the internal HW + * resources, protected by the global mut + */ + int closing; + /* sync between removal event and id destroy, protected by file mut */ + int destroying; + struct work_struct close_work; }; struct ucma_multicast { @@ -84,6 +94,7 @@ int events_reported; u64 uid; + u8 join_state; struct list_head list; struct sockaddr_storage addr; }; @@ -94,6 +105,7 @@ struct list_head list; struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; + struct work_struct close_work; }; static DEFINE_MUTEX(mut); @@ -119,8 +131,12 @@ mutex_lock(&mut); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) - atomic_inc(&ctx->ref); + if (!IS_ERR(ctx)) { + if (ctx->closing) + ctx = ERR_PTR(-EIO); + else + atomic_inc(&ctx->ref); + } mutex_unlock(&mut); return ctx; } @@ -131,31 +147,46 @@ complete(&ctx->comp); } +static void ucma_close_event_id(struct work_struct *work) +{ + struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); + + rdma_destroy_id(uevent_close->cm_id); + kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* once all inflight tasks are finished, we close all underlying + * resources. The context is still alive till its explicit destryoing + * by its creator. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. 
*/ + rdma_destroy_id(ctx->cm_id); +} + static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; - int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + INIT_WORK(&ctx->close_work, ucma_close_id); atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); ctx->file = file; - do { - ret = idr_pre_get(&ctx_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&ctx_idr, ctx, &ctx->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (ctx->id < 0) goto error; list_add_tail(&ctx->list, &file->ctx_list); @@ -169,23 +200,15 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc; - int ret; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; - do { - ret = idr_pre_get(&multicast_idr, GFP_KERNEL); - if (!ret) - goto error; - - mutex_lock(&mut); - ret = idr_get_new(&multicast_idr, mc, &mc->id); - mutex_unlock(&mut); - } while (ret == -EAGAIN); - - if (ret) + mutex_lock(&mut); + mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (mc->id < 0) goto error; mc->ctx = ctx; @@ -233,8 +256,8 @@ switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: - uevent->mc = (struct ucma_multicast *) - event->param.ud.private_data; + uevent->mc = __DECONST(struct ucma_multicast *, + event->param.ud.private_data); uevent->resp.uid = uevent->mc->uid; uevent->resp.id = uevent->mc->id; break; @@ -245,6 +268,44 @@ } } +/* Called with file->mut locked for the relevant context. */ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ + struct ucma_context *ctx = cm_id->context; + struct ucma_event *con_req_eve; + int event_found = 0; + + if (ctx->destroying) + return; + + /* only if context is pointing to cm_id that it owns it and can be + * queued to be closed, otherwise that cm_id is an inflight one that + * is part of that context event list pending to be detached and + * reattached to its new context as part of ucma_get_event, + * handled separately below. + */ + if (ctx->cm_id == cm_id) { + mutex_lock(&mut); + ctx->closing = 1; + mutex_unlock(&mut); + queue_work(ctx->file->close_wq, &ctx->close_work); + return; + } + + list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { + if (con_req_eve->cm_id == cm_id && + con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + list_del(&con_req_eve->list); + INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); + queue_work(ctx->file->close_wq, &con_req_eve->close_work); + event_found = 1; + break; + } + } + if (!event_found) + pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} + static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -274,21 +335,26 @@ goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a * new connection before the user accepts it. This is okay, - * since the accept will just fail later. + * since the accept will just fail later. However, we do need + * to release the underlying HW resources in case of a device + * removal event. 
*/ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); + kfree(uevent); goto out; } list_add_tail(&uevent->list, &ctx->file->event_list); wake_up_interruptible(&ctx->file->poll_wait); - if (ctx->file->filp) - selwakeup(&ctx->file->filp->f_selinfo); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); out: mutex_unlock(&ctx->file->mut); return ret; @@ -334,7 +400,6 @@ ctx->cm_id = uevent->cm_id; ctx->cm_id->context = ctx; uevent->resp.id = ctx->id; - ctx->cm_id->ucontext = ctx; } if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -372,7 +437,7 @@ } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, - int in_len, int out_len) + int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; @@ -397,12 +462,12 @@ return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = rdma_create_id(TD_TO_VNET(curthread), + ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; } - ctx->cm_id->ucontext = ctx; resp.id = ctx->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -449,9 +514,15 @@ } /* - * We cannot hold file->mut when calling rdma_destroy_id() or we can - * deadlock. We also acquire file->mut in ucma_event_handler(), and - * rdma_destroy_id() will wait until all callbacks have completed. + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. + * These might include pending connect requests which we have not completed + * processing. We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed. */ static int ucma_free_ctx(struct ucma_context *ctx) { @@ -459,8 +530,6 @@ struct ucma_event *uevent, *tmp; LIST_HEAD(list); - /* No new events will be generated after destroying the id. 
*/ - rdma_destroy_id(ctx->cm_id); ucma_cleanup_multicast(ctx); @@ -508,10 +577,24 @@ if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - resp.events_reported = ucma_free_ctx(ctx); + mutex_lock(&ctx->file->mut); + ctx->destroying = 1; + mutex_unlock(&ctx->file->mut); + flush_workqueue(ctx->file->close_wq); + /* At this point it's guaranteed that there is no inflight + * closing task */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + + resp.events_reported = ucma_free_ctx(ctx); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; @@ -519,10 +602,10 @@ return ret; } -static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf, +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_bind_addr cmd; + struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; @@ -538,24 +621,75 @@ return ret; } +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct sockaddr *addr; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + addr = (struct sockaddr *) &cmd.addr; + if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, + cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; + struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + src = (struct sockaddr *) &cmd.src_addr; + dst = (struct sockaddr *) &cmd.dst_addr; + if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + return -EINVAL; + ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, - (struct sockaddr *) &cmd.dst_addr, - cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } @@ -648,7 +782,7 @@ const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_query_route cmd; + struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; @@ -678,26 +812,13 @@ resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(ctx->cm_id->device, - ctx->cm_id->port_num)) { - case 
IB_LINK_LAYER_INFINIBAND: - ucma_copy_ib_route(&resp, &ctx->cm_id->route); - break; - case IB_LINK_LAYER_ETHERNET: - ucma_copy_iboe_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } - break; - case RDMA_TRANSPORT_IWARP: + + if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); - break; - default: - break; - } out: if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -708,7 +829,162 @@ return ret; } -static void ucma_copy_conn_param(struct rdma_conn_param *dst, +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], + &resp->path_data[i].path_rec); + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if 
(ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = (void __user *)(unsigned long) cmd.response; + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; @@ -720,6 +996,7 @@ dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, @@ -740,7 +1017,7 @@ if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); ret = rdma_connect(ctx->cm_id, &conn_param); ucma_put_ctx(ctx); return ret; @@ -783,7 +1060,7 @@ return PTR_ERR(ctx); if (cmd.conn_param.valid) { - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); ret = rdma_accept(ctx->cm_id, &conn_param); if (!ret) @@ -924,6 +1201,8 @@ if (!optlen) return -EINVAL; + memset(&sa_path, 0, sizeof(sa_path)); + ib_sa_unpack_path(path_data->path_rec, &sa_path); ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); if (ret) @@ -937,22 +1216,12 @@ static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { - int ret = 0; + int ret; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; - - case RDMA_OPTION_IB_APM: - if (optlen != sizeof(u8)) { - ret = -EINVAL; - break; - } - if (*(u8 *)optval) - ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST); - break; - default: ret = -ENOSYS; } @@ -994,24 +1263,18 @@ if (IS_ERR(ctx)) return PTR_ERR(ctx); - optval = kmalloc(cmd.optlen, GFP_KERNEL); - if (!optval) { - ret = -ENOMEM; - goto err_ucma_put_ctx; - } - - if (copy_from_user(optval, (void __user *)(unsigned long)cmd.optval, - cmd.optlen)) { - ret = -EFAULT; - goto err_kfree; + optval = memdup_user((void __user *) (unsigned long) cmd.optval, + cmd.optlen); + if (IS_ERR(optval)) { + ret = PTR_ERR(optval); + goto out; } ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); - -err_kfree: kfree(optval); -err_ucma_put_ctx: + +out: ucma_put_ctx(ctx); return ret; } @@ -1035,23 +1298,31 @@ return ret; } -static ssize_t 
ucma_join_multicast(struct ucma_file *file, - const char __user *inbuf, - int in_len, int out_len) +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) { - struct rdma_ucm_join_mcast cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; + struct sockaddr *addr; int ret; + u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; + addr = (struct sockaddr *) &cmd->addr; + if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; - ctx = ucma_get_ctx(file, cmd.id); + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + else + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -1061,15 +1332,16 @@ ret = -ENOMEM; goto err1; } - - mc->uid = cmd.uid; - memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr); - ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); + mc->join_state = join_state; + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); if (ret) goto err2; resp.id = mc->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user((void __user *)(unsigned long) cmd->response, &resp, sizeof(resp))) { ret = -EFAULT; goto err3; @@ -1094,6 +1366,38 @@ return ret; } +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + return ucma_process_join(file, &cmd, out_len); +} + static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1115,10 +1419,10 @@ mc = ERR_PTR(-ENOENT); else if (mc->ctx->file != file) mc = ERR_PTR(-EINVAL); - else { + else if (!atomic_inc_not_zero(&mc->ctx->ref)) + mc = ERR_PTR(-ENXIO); + else idr_remove(&multicast_idr, mc->id); - atomic_inc(&mc->ctx->ref); - } mutex_unlock(&mut); if (IS_ERR(mc)) { @@ -1148,10 +1452,10 @@ /* Acquire mutex's based on pointer comparison to prevent deadlock. 
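Note (not part of the patch): the join_flags handling above maps RDMA_MC_JOIN_FLAG_FULLMEMBER and RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER onto JoinState bits for the extended rdma_join_multicast(). A hedged sketch of a kernel consumer asking for a send-only full-member join with the new third argument; my_sendonly_join and my_ctx are made-up names.

#include <rdma/rdma_cm.h>
#include <rdma/ib_sa.h>

/*
 * Illustrative only: request a send-only full-member join using the
 * join_state argument added to rdma_join_multicast() in this update.
 */
static int my_sendonly_join(struct rdma_cm_id *id,
			    struct sockaddr *mcast_addr, void *my_ctx)
{
	/* BIT(SENDONLY_FULLMEMBER_JOIN) is the MCMemberRecord JoinState
	 * bit that ucma_process_join() selects for the send-only flag. */
	return rdma_join_multicast(id, mcast_addr,
				   BIT(SENDONLY_FULLMEMBER_JOIN), my_ctx);
}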
*/ if (file1 < file2) { mutex_lock(&file1->mut); - mutex_lock(&file2->mut); + mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); } else { mutex_lock(&file2->mut); - mutex_lock(&file1->mut); + mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); } } @@ -1236,25 +1540,29 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { - [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, - [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, - [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr, - [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route, - [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, - [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, - [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, - [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, - [RDMA_USER_CM_CMD_REJECT] = ucma_reject, - [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, - [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, - [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, - [RDMA_USER_CM_CMD_GET_OPTION] = NULL, - [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, - [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, - [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, - [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, @@ -1264,6 +1572,9 @@ struct rdma_ucm_cmd_hdr hdr; ssize_t ret; + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; + if (len < sizeof(hdr)) return -EINVAL; @@ -1315,6 +1626,13 @@ if (!file) return -ENOMEM; + file->close_wq = alloc_ordered_workqueue("ucma_close_id", + WQ_MEM_RECLAIM); + if (!file->close_wq) { + kfree(file); + return -ENOMEM; + } + INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1333,16 +1651,34 @@ mutex_lock(&file->mut); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + ctx->destroying = 1; mutex_unlock(&file->mut); mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); + flush_workqueue(file->close_wq); + /* At that step once ctx was marked as destroying and workqueue + * was flushed we are safe from any inflights handlers that + * might put other closing task. 
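For context on the command table and write() path above, a hedged userspace sketch (not part of the patch) of how one of the new commands is framed; librdmacm normally issues these internally. It assumes fd is an open /dev/infiniband/rdma_cm descriptor and id is a context id previously returned by RDMA_USER_CM_CMD_CREATE_ID; structure layouts come from <rdma/rdma_user_cm.h>.

#include <rdma/rdma_user_cm.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only: issue RDMA_USER_CM_CMD_QUERY / RDMA_USER_CM_QUERY_ADDR. */
static int query_addr(int fd, __u32 id, struct rdma_ucm_query_addr_resp *resp)
{
	struct {
		struct rdma_ucm_cmd_hdr hdr;	/* command index, in/out sizes */
		struct rdma_ucm_query cmd;	/* id, option, response pointer */
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.hdr.cmd = RDMA_USER_CM_CMD_QUERY;
	msg.hdr.in = sizeof(msg.cmd);
	msg.hdr.out = sizeof(*resp);
	msg.cmd.id = id;
	msg.cmd.option = RDMA_USER_CM_QUERY_ADDR;
	msg.cmd.response = (uintptr_t)resp;

	/* A single write() carries the header followed by the payload. */
	return write(fd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg) ? 0 : -1;
}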
+ */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + /* rdma_destroy_id ensures that no event handlers are + * inflight for that id before releasing it. + */ + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + ucma_free_ctx(ctx); mutex_lock(&file->mut); } mutex_unlock(&file->mut); + destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1371,11 +1707,11 @@ }; static struct miscdevice ucma_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "rdma_cm", + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, - .fops = &ucma_fops, + .fops = &ucma_fops, }; static ssize_t show_abi_version(struct device *dev, @@ -1396,7 +1732,7 @@ ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { - printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); + pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } @@ -1411,6 +1747,7 @@ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); + idr_destroy(&multicast_idr); } module_init(ucma_init); Index: sys/ofed/drivers/infiniband/core/ib_ud_header.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_ud_header.c +++ sys/ofed/drivers/infiniband/core/ib_ud_header.c @@ -33,11 +33,12 @@ #include #include -#include #include #include +#include + #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ @@ -116,6 +117,72 @@ .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,28 +280,61 @@ .size_bits = 24 } }; +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct ip iph; + + iph.ip_hl = 5; + iph.ip_v = 4; + iph.ip_tos = header->ip4.tos; + iph.ip_len = header->ip4.tot_len; + iph.ip_id = header->ip4.id; + iph.ip_off = header->ip4.frag_off; + iph.ip_ttl = 
header->ip4.ttl; + iph.ip_p = header->ip4.protocol; + iph.ip_sum = 0; + iph.ip_src.s_addr = header->ip4.saddr; + iph.ip_dst.s_addr = header->ip4.daddr; + + return in_cksum_hdr(&iph); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0; + + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { - u16 packet_length = 0; + u16 packet_length; header->lrh.link_version = 0; header->lrh.link_next_header = @@ -250,18 +350,39 @@ } if (vlan_present) - header->eth.type = cpu_to_be16(ETH_P_8021Q); + header->eth.type = cpu_to_be16(ETH_P_8021Q); + + if (ip_version == 6 || grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + header->grh.next_header = udp_present ? 
IPPROTO_UDP : 0x1b; + } - if (grh_present) { - header->grh.ip_version = 6; - header->grh.payload_length = - cpu_to_be16((IB_BTH_BYTES + + if (ip_version == 4) { + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes + - 4 + /* ICRC */ - 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +394,11 @@ header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -293,35 +417,45 @@ if (header->lrh_present) { ib_pack(lrh_table, ARRAY_SIZE(lrh_table), - &header->lrh, buf + len); + &header->lrh, (char *)buf + len); len += IB_LRH_BYTES; } if (header->eth_present) { ib_pack(eth_table, ARRAY_SIZE(eth_table), - &header->eth, buf + len); + &header->eth, (char *)buf + len); len += IB_ETH_BYTES; } if (header->vlan_present) { ib_pack(vlan_table, ARRAY_SIZE(vlan_table), - &header->vlan, buf + len); + &header->vlan, (char *)buf + len); len += IB_VLAN_BYTES; } if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), - &header->grh, buf + len); + &header->grh, (char *)buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, (char *)buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, (char *)buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), - &header->bth, buf + len); + &header->bth, (char *)buf + len); len += IB_BTH_BYTES; ib_pack(deth_table, ARRAY_SIZE(deth_table), - &header->deth, buf + len); + &header->deth, (char *)buf + len); len += IB_DETH_BYTES; if (header->immediate_present) { - memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data); + memcpy((char *)buf + len, &header->immediate_data, sizeof header->immediate_data); len += sizeof header->immediate_data; } @@ -342,11 +476,11 @@ { ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, &header->lrh); - buf += IB_LRH_BYTES; + buf = (char *)buf + IB_LRH_BYTES; if (header->lrh.link_version != 0) { - printk(KERN_WARNING "Invalid LRH.link_version %d\n", - header->lrh.link_version); + pr_warn("Invalid LRH.link_version %d\n", + header->lrh.link_version); return -EINVAL; } @@ -359,29 +493,29 @@ header->grh_present = 1; ib_unpack(grh_table, ARRAY_SIZE(grh_table), buf, &header->grh); - buf += IB_GRH_BYTES; + buf = (char *)buf + IB_GRH_BYTES; if (header->grh.ip_version != 6) { - printk(KERN_WARNING "Invalid GRH.ip_version %d\n", - header->grh.ip_version); + pr_warn("Invalid GRH.ip_version %d\n", + header->grh.ip_version); return -EINVAL; } if (header->grh.next_header != 0x1b) { - printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); return -EINVAL; } break; default: - printk(KERN_WARNING "Invalid 
LRH.link_next_header %d\n", - header->lrh.link_next_header); + pr_warn("Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); return -EINVAL; } ib_unpack(bth_table, ARRAY_SIZE(bth_table), buf, &header->bth); - buf += IB_BTH_BYTES; + buf = (char *)buf + IB_BTH_BYTES; switch (header->bth.opcode) { case IB_OPCODE_UD_SEND_ONLY: @@ -391,20 +525,19 @@ header->immediate_present = 1; break; default: - printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n", - header->bth.opcode); + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); return -EINVAL; } if (header->bth.transport_header_version != 0) { - printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n", - header->bth.transport_header_version); + pr_warn("Invalid BTH.transport_header_version %d\n", + header->bth.transport_header_version); return -EINVAL; } ib_unpack(deth_table, ARRAY_SIZE(deth_table), buf, &header->deth); - buf += IB_DETH_BYTES; + buf = (char *)buf + IB_DETH_BYTES; if (header->immediate_present) memcpy(&header->immediate_data, buf, sizeof header->immediate_data); Index: sys/ofed/drivers/infiniband/core/ib_umem.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_umem.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define LINUXKPI_PARAM_PREFIX ibcore_ + +#include +#include +#include +#include +#include + +#include "uverbs.h" + +#include + +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) +{ + struct scatterlist *sg; + struct page *page; + int i; + + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->nmap, + DMA_BIDIRECTIONAL); + + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + + page = sg_page(sg); + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); + } + + sg_free_table(&umem->sg_head); + return; + +} + +/** + * ib_umem_get - Pin and DMA map userspace memory. + * + * If access flags indicate ODP memory, avoid pinning. 
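Looking back at the ib_ud_header.c changes just above: ib_ud_header_init() now returns an error and takes ip_version and udp_present arguments, and ib_ud_ip4_csum() fills the IPv4 header checksum. The sketch below is illustrative only (build_rocev2_ud_header is a made-up helper, not taken from any driver) and shows an Ethernet + IPv4 + UDP header build; remaining fields such as MACs, TTL and BTH/DETH values would be set by a real caller.

#include <rdma/ib_pack.h>

/* Illustrative only: build and pack a UD header with IPv4 + UDP
 * encapsulation using the extended ib_ud_header_init(). */
static int build_rocev2_ud_header(struct ib_ud_header *hdr, void *wire_buf,
				  int payload_bytes, __be32 saddr, __be32 daddr,
				  __be16 sport, __be16 dport)
{
	int ret;

	ret = ib_ud_header_init(payload_bytes,
				0,	/* lrh_present: no LRH on Ethernet */
				1,	/* eth_present */
				0,	/* vlan_present */
				0,	/* grh_present (forced off for IPv4) */
				4,	/* ip_version */
				1,	/* udp_present */
				0,	/* immediate_present */
				hdr);
	if (ret)
		return ret;

	/* ib_ud_header_init() filled lengths and protocol; addressing is ours. */
	hdr->ip4.saddr = saddr;
	hdr->ip4.daddr = daddr;
	hdr->ip4.check = ib_ud_ip4_csum(hdr);
	hdr->udp.sport = sport;
	hdr->udp.dport = dport;

	return ib_ud_header_pack(hdr, wire_buf);
}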
Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmasync: flush in-flight DMA when the memory region is written + */ +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) +{ + struct ib_umem *umem; + struct page **page_list; + struct vm_area_struct **vma_list; + unsigned long locked; + unsigned long cur_base; + unsigned long npages; + int ret; + int i; + struct dma_attrs dma_attrs = { 0 }; + struct scatterlist *sg, *sg_list_start; + int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; + + if (dmasync) + dma_attrs.flags |= DMA_ATTR_WRITE_BARRIER; + + if (!size) + return ERR_PTR(-EINVAL); + + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) + return ERR_PTR(-EINVAL); + + if (priv_check(curthread, PRIV_VM_MLOCK) != 0) + return ERR_PTR(-EPERM); + + umem = kzalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_size = PAGE_SIZE; + umem->pid = get_pid(task_pid(current)); + /* + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. + */ + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + + if (access & IB_ACCESS_ON_DEMAND) { + ret = ib_umem_odp_get(context, umem); + if (ret) { + kfree(umem); + return ERR_PTR(ret); + } + return umem; + } + + umem->odp_data = NULL; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); + + npages = ib_umem_num_pages(umem); + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->pinned_vm; + + cur_base = addr & PAGE_MASK; + + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + if (!umem->writable) + gup_flags |= FOLL_FORCE; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + + while (npages) { + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + gup_flags, page_list, vma_list); + + if (ret < 0) + goto out; + + umem->npages += ret; + cur_base += ret * PAGE_SIZE; + npages -= ret; + + for_each_sg(sg_list_start, sg, ret, i) { + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + } + + /* preparing for next loop */ + sg_list_start = sg; + } + + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + &dma_attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + +out: + if (ret < 0) { + if (need_release) + __ib_umem_release(context->device, umem, 0); + put_pid(umem->pid); + kfree(umem); + } else + current->mm->pinned_vm = locked; + + up_write(¤t->mm->mmap_sem); + if (vma_list) + 
free_page((unsigned long) vma_list); + free_page((unsigned long) page_list); + + return ret < 0 ? ERR_PTR(ret) : umem; +} +EXPORT_SYMBOL(ib_umem_get); + +static void ib_umem_account(struct work_struct *work) +{ + struct ib_umem *umem = container_of(work, struct ib_umem, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->pinned_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + struct mm_struct *mm; + struct task_struct *task; + unsigned long diff; + + if (umem->odp_data) { + ib_umem_odp_release(umem); + return; + } + + __ib_umem_release(umem->context->device, umem, 1); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + diff = ib_umem_num_pages(umem); + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (context->closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_account); + umem->mm = mm; + umem->diff = diff; + + queue_work(ib_wq, &umem->work); + return; + } + } else + down_write(&mm->mmap_sem); + + mm->pinned_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); +out: + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +int ib_umem_page_count(struct ib_umem *umem) +{ + int shift; + int i; + int n; + struct scatterlist *sg; + + if (umem->odp_data) + return ib_umem_num_pages(umem); + + shift = ilog2(umem->page_size); + + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> shift; + + return n; +} +EXPORT_SYMBOL(ib_umem_page_count); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", + offset, umem->length, end); + return -EINVAL; + } + +#ifdef __linux__ + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + offset + ib_umem_offset(umem)); +#else + ret = 0; +#endif + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); Index: sys/ofed/drivers/infiniband/core/ib_umem_odp.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_umem_odp.c @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
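Before the on-demand paging variant that follows, a hedged sketch of how a verbs driver typically consumes the ib_umem API added in this file; my_count_hw_pages and its page_shift handling are illustrative only.

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>

/*
 * Illustrative only: pin a user buffer as for a memory registration and
 * count how many HW pages of size (1 << page_shift) it needs, walking
 * the DMA-mapped scatterlist produced by ib_umem_get().
 */
static long my_count_hw_pages(struct ib_ucontext *context, u64 start,
			      u64 length, int access, int page_shift)
{
	struct ib_umem *umem;
	struct scatterlist *sg;
	long n = 0;
	int i;

	umem = ib_umem_get(context, start, length, access, 0 /* dmasync */);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		n += sg_dma_len(sg) >> page_shift;

	/* A real driver keeps the umem for the lifetime of the MR; here we
	 * release it again, which unmaps and unpins the pages. */
	ib_umem_release(umem);
	return n;
}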
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +static void ib_umem_notifier_start_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + int notifiers_count = item->odp_data->notifiers_count++; + + if (notifiers_count == 0) + /* Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one + * should be waiting right now. */ + reinit_completion(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +static void ib_umem_notifier_end_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + /* + * This sequence increase will notify the QP page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + ++item->odp_data->notifiers_seq; + if (--item->odp_data->notifiers_count == 0) + complete_all(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +/* Account for a new mmu notifier in an ib_ucontext. */ +static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +{ + atomic_inc(&context->notifier_count); +} + +/* Account for a terminating mmu notifier in an ib_ucontext. + * + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since + * the function takes the semaphore itself. */ +static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +{ + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + + if (zero_notifiers && + !list_empty(&context->no_private_counters)) { + /* No currently running mmu notifiers. Now is the chance to + * add private accounting to all previously added umems. */ + struct ib_umem_odp *odp_data, *next; + + /* Prevent concurrent mmu notifiers from working on the + * no_private_counters list. 
*/ + down_write(&context->umem_rwsem); + + /* Read the notifier_count again, with the umem_rwsem + * semaphore taken for write. */ + if (!atomic_read(&context->notifier_count)) { + list_for_each_entry_safe(odp_data, next, + &context->no_private_counters, + no_private_counters) { + mutex_lock(&odp_data->umem_mutex); + odp_data->mn_counters_active = true; + list_del(&odp_data->no_private_counters); + complete_all(&odp_data->notifier_completion); + mutex_unlock(&odp_data->umem_mutex); + } + } + + up_write(&context->umem_rwsem); + } +} + +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) { + /* + * Increase the number of notifiers running, to + * prevent any further fault handling on this MR. + */ + ib_umem_notifier_start_account(item); + item->odp_data->dying = 1; + /* Make sure that the fact the umem is dying is out before we release + * all pending page faults. */ + smp_wmb(); + complete_all(&item->odp_data->notifier_completion); + item->context->invalidate_range(item, ib_umem_start(item), + ib_umem_end(item)); + return 0; +} + +static void ib_umem_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ULLONG_MAX, + ib_umem_notifier_release_trampoline, + NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_page_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, start + PAGE_SIZE); + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, + address + PAGE_SIZE, + invalidate_page_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, end); + return 0; +} + +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_start_trampoline, NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + down_read(&context->umem_rwsem); + 
rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_end_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_page = ib_umem_notifier_invalidate_page, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, +}; + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +{ + int ret_val; + pid_t our_pid; + struct mm_struct *mm = get_task_mm(current); + + if (!mm) + return -EINVAL; + + /* Prevent creating ODP MRs in child processes */ + rcu_read_lock(); + our_pid = get_pid(task_pid_group_leader(current)); + rcu_read_unlock(); + put_pid(our_pid); + if (context->tgid != our_pid) { + ret_val = -EINVAL; + goto out_mm; + } + + umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); + if (!umem->odp_data) { + ret_val = -ENOMEM; + goto out_mm; + } + umem->odp_data->umem = umem; + + mutex_init(&umem->odp_data->umem_mutex); + + init_completion(&umem->odp_data->notifier_completion); + + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->page_list)); + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } + + umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->dma_list)); + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } + + /* + * When using MMU notifiers, we will get a + * notification before the "current" task (and MM) is + * destroyed. We use the umem_rwsem semaphore to synchronize. + */ + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem->odp_data->interval_tree, + &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count)) || + context->odp_mrs_count == 1) + umem->odp_data->mn_counters_active = true; + else + list_add(&umem->odp_data->no_private_counters, + &context->no_private_counters); + downgrade_write(&context->umem_rwsem); + + if (context->odp_mrs_count == 1) { + /* + * Note that at this point, no MMU notifier is running + * for this context! + */ + atomic_set(&context->notifier_count, 0); + INIT_HLIST_NODE(&context->mn.hlist); + context->mn.ops = &ib_umem_notifiers; + /* + * Lock-dep detects a false positive for mmap_sem vs. + * umem_rwsem, due to not grasping downgrade_write correctly. + */ + ret_val = mmu_notifier_register(&context->mn, mm); + if (ret_val) { + pr_err("Failed to register mmu_notifier %d\n", ret_val); + ret_val = -EBUSY; + goto out_mutex; + } + } + + up_read(&context->umem_rwsem); + + /* + * Note that doing an mmput can cause a notifier for the relevant mm. + * If the notifier is called while we hold the umem_rwsem, this will + * cause a deadlock. Therefore, we release the reference only after we + * released the semaphore. + */ + mmput(mm); + return 0; + +out_mutex: + up_read(&context->umem_rwsem); + vfree(umem->odp_data->dma_list); +out_page_list: + vfree(umem->odp_data->page_list); +out_odp_data: + kfree(umem->odp_data); +out_mm: + mmput(mm); + return ret_val; +} + +void ib_umem_odp_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + + /* + * Ensure that no more pages are mapped in the umem. 
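The ib_umem_odp_get() path above is reached when userspace registers a memory region with the on-demand access flag. A hedged userspace sketch (libibverbs), assuming the device actually advertises ODP support:

#include <infiniband/verbs.h>
#include <stdio.h>

/* Illustrative only: register an on-demand paging MR.  In the kernel,
 * IBV_ACCESS_ON_DEMAND becomes IB_ACCESS_ON_DEMAND and steers
 * ib_umem_get() into ib_umem_odp_get() instead of pinning the pages. */
static struct ibv_mr *reg_odp_mr(struct ibv_pd *pd, void *buf, size_t len)
{
	struct ibv_mr *mr;

	mr = ibv_reg_mr(pd, buf, len,
			IBV_ACCESS_LOCAL_WRITE |
			IBV_ACCESS_REMOTE_READ |
			IBV_ACCESS_ON_DEMAND);
	if (mr == NULL)
		perror("ibv_reg_mr(ODP)");
	return mr;
}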
+ * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_end(umem)); + + down_write(&context->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem->odp_data->interval_tree, + &context->umem_tree); + context->odp_mrs_count--; + if (!umem->odp_data->mn_counters_active) { + list_del(&umem->odp_data->no_private_counters); + complete_all(&umem->odp_data->notifier_completion); + } + + /* + * Downgrade the lock to a read lock. This ensures that the notifiers + * (who lock the mutex for reading) will be able to finish, and we + * will be able to enventually obtain the mmu notifiers SRCU. Note + * that since we are doing it atomically, no other user could register + * and unregister while we do the check. + */ + downgrade_write(&context->umem_rwsem); + if (!context->odp_mrs_count) { + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(context->tgid, + PIDTYPE_PID); + if (owning_process == NULL) + /* + * The process is already dead, notifier were removed + * already. + */ + goto out; + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) + /* + * The process' mm is already dead, notifier were + * removed already. + */ + goto out_put_task; + mmu_notifier_unregister(&context->mn, owning_mm); + + mmput(owning_mm); + +out_put_task: + put_task_struct(owning_process); + } +out: + up_read(&context->umem_rwsem); + + vfree(umem->odp_data->dma_list); + vfree(umem->odp_data->page_list); + kfree(umem->odp_data); + kfree(umem); +} + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page to. + * @page_index: index in the umem to add the page to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. + * @current_seq: sequence number for synchronization with invalidations. + * the sequence number is taken from + * umem->odp_data->notifiers_seq. + * + * The function returns -EFAULT if the DMA mapping operation fails. It returns + * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * + * The page is released via put_page even if the operation failed. For + * on-demand pinning, the page is released whenever it isn't stored in the + * umem. + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem *umem, + int page_index, + u64 base_virt_addr, + struct page *page, + u64 access_mask, + unsigned long current_seq) +{ + struct ib_device *dev = umem->context->device; + dma_addr_t dma_addr; + int stored_page = 0; + int remove_existing_mapping = 0; + int ret = 0; + + /* + * Note: we avoid writing if seq is different from the initial seq, to + * handle case of a racing notifier. This check also allows us to bail + * early if we have a notifier running in parallel with us. 
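To make the sequence/retry protocol referred to in these comments concrete, the sketch below shows the shape of a driver page-fault path. It is illustrative rather than lifted from any driver, and my_install_hw_mapping is a placeholder for the hardware-specific step.

#include <rdma/ib_umem_odp.h>

static void my_install_hw_mapping(struct ib_umem *umem, u64 va, int npages)
{
	/* Driver specific: point the HW page tables at the new pages. */
}

/*
 * Illustrative only: sample notifiers_seq before faulting pages in,
 * then re-check it under umem_mutex before exposing the mapping to HW.
 */
static int my_odp_page_fault(struct ib_umem *umem, u64 va, u64 bcnt,
			     u64 access_mask)
{
	unsigned long current_seq;
	int npages;

	current_seq = READ_ONCE(umem->odp_data->notifiers_seq);
	/* Pairs with the increment done in ib_umem_notifier_end_account(). */
	smp_rmb();

	npages = ib_umem_odp_map_dma_pages(umem, va, bcnt, access_mask,
					   current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(umem, current_seq))
		my_install_hw_mapping(umem, va, npages);
	/* else an invalidation raced with us and the fault is retried. */
	mutex_unlock(&umem->odp_data->umem_mutex);

	return npages;
}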
+ */ + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + ret = -EAGAIN; + goto out; + } + if (!(umem->odp_data->dma_list[page_index])) { + dma_addr = ib_dma_map_page(dev, + page, + 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, dma_addr)) { + ret = -EFAULT; + goto out; + } + umem->odp_data->dma_list[page_index] = dma_addr | access_mask; + umem->odp_data->page_list[page_index] = page; + stored_page = 1; + } else if (umem->odp_data->page_list[page_index] == page) { + umem->odp_data->dma_list[page_index] |= access_mask; + } else { + pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem->odp_data->page_list[page_index], page); + /* Better remove the mapping now, to prevent any further + * damage. */ + remove_existing_mapping = 1; + } + +out: + /* On Demand Paging - avoid pinning the page */ + if (umem->context->invalidate_range || !stored_page) + put_page(page); + + if (remove_existing_mapping && umem->context->invalidate_range) { + invalidate_page_trampoline( + umem, + base_virt_addr + (page_index * PAGE_SIZE), + base_virt_addr + ((page_index+1)*PAGE_SIZE), + NULL); + ret = -EAGAIN; + } + + return ret; +} + +/** + * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * + * Pins the range of pages passed in the argument, and maps them to + * DMA addresses. The DMA addresses of the mapped pages is updated in + * umem->odp_data->dma_list. + * + * Returns the number of pages mapped in success, negative error code + * for failure. + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents + * the function from completing its task. + * + * @umem: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. The actual pages mapped is returned in + * the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @current_seq: the MMU notifiers sequance value for synchronization with + * invalidations. the sequance number is read from + * umem->odp_data->notifiers_seq before calling this function + */ +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, + u64 access_mask, unsigned long current_seq) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + struct page **local_page_list = NULL; + u64 off; + int j, k, ret = 0, start_idx, npages = 0; + u64 base_virt_addr; + unsigned int flags = 0; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem) || + user_virt + bcnt > ib_umem_end(umem)) + return -EFAULT; + + local_page_list = (struct page **)__get_free_page(GFP_KERNEL); + if (!local_page_list) + return -ENOMEM; + + off = user_virt & (~PAGE_MASK); + user_virt = user_virt & PAGE_MASK; + base_virt_addr = user_virt; + bcnt += off; /* Charge for the first page offset as well. 
*/ + + owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (owning_process == NULL) { + ret = -EINVAL; + goto out_no_task; + } + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) { + ret = -EINVAL; + goto out_put_task; + } + + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; + k = start_idx; + + while (bcnt > 0) { + const size_t gup_num_pages = + min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, + PAGE_SIZE / sizeof(struct page *)); + + down_read(&owning_mm->mmap_sem); + /* + * Note: this might result in redundent page getting. We can + * avoid this by checking dma_list to be 0 before calling + * get_user_pages. However, this make the code much more + * complex (and doesn't gain us much performance in most use + * cases). + */ + npages = get_user_pages_remote(owning_process, owning_mm, + user_virt, gup_num_pages, + flags, local_page_list, NULL); + up_read(&owning_mm->mmap_sem); + + if (npages < 0) + break; + + bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); + user_virt += npages << PAGE_SHIFT; + mutex_lock(&umem->odp_data->umem_mutex); + for (j = 0; j < npages; ++j) { + ret = ib_umem_odp_map_dma_single_page( + umem, k, base_virt_addr, local_page_list[j], + access_mask, current_seq); + if (ret < 0) + break; + k++; + } + mutex_unlock(&umem->odp_data->umem_mutex); + + if (ret < 0) { + /* Release left over pages when handling errors. */ + for (++j; j < npages; ++j) + put_page(local_page_list[j]); + break; + } + } + + if (ret >= 0) { + if (npages < 0 && k == start_idx) + ret = npages; + else + ret = k - start_idx; + } + + mmput(owning_mm); +out_put_task: + put_task_struct(owning_process); +out_no_task: + free_page((unsigned long)local_page_list); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, + u64 bound) +{ + int idx; + u64 addr; + struct ib_device *dev = umem->context->device; + + virt = max_t(u64, virt, ib_umem_start(umem)); + bound = min_t(u64, bound, ib_umem_end(umem)); + /* Note that during the run of this function, the + * notifiers_count of the MR is > 0, preventing any racing + * faults from completion. We might be racing with other + * invalidations, so we must make sure we free each page only + * once. */ + mutex_lock(&umem->odp_data->umem_mutex); + for (addr = virt; addr < bound; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + if (umem->odp_data->page_list[idx]) { + struct page *page = umem->odp_data->page_list[idx]; + dma_addr_t dma = umem->odp_data->dma_list[idx]; + dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + + WARN_ON(!dma_addr); + + ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. 
+ */ + set_page_dirty(head_page); + } + /* on demand pinning support */ + if (!umem->context->invalidate_range) + put_page(page); + umem->odp_data->page_list[idx] = NULL; + umem->odp_data->dma_list[idx] = 0; + } + } + mutex_unlock(&umem->odp_data->umem_mutex); +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); Index: sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +/* + * The ib_umem list keeps track of memory regions for which the HW + * device request to receive notification when the related memory + * mapping is changed. + * + * ib_umem_lock protects the list. + */ + +static inline u64 node_start(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_start(umem_odp->umem); +} + +/* Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not contained + * in the umem. + */ +static inline u64 node_last(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_end(umem_odp->umem) - 1; +} + +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, + node_start, node_last, , rbt_ib_umem) + +/* @last is not a part of the interval. See comment for function + * node_last. 
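As a usage sketch of the interval-tree iterator defined in this file (illustrative only; count_overlap_cb and count_odp_umems are made-up names), the caller passes a half-open [start, end) range, which the iterator converts to the closed-interval convention described above:

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem_odp.h>

/* Illustrative only: callback matching the umem_call_back signature. */
static int count_overlap_cb(struct ib_umem *umem, u64 start, u64 end,
			    void *cookie)
{
	(*(int *)cookie)++;
	return 0;
}

/* Count the ODP umems of a ucontext that overlap [start, end). */
static int count_odp_umems(struct ib_ucontext *context, u64 start, u64 end)
{
	int n = 0;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
				      count_overlap_cb, &n);
	up_read(&context->umem_rwsem);

	return n;
}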
+ */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, + u64 start, u64 last, + umem_call_back cb, + void *cookie) +{ + int ret_val = 0; + struct umem_odp_node *node; + struct ib_umem_odp *umem; + + if (unlikely(start == last)) + return ret_val; + + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; + node = rbt_ib_umem_iter_next(node, start, last - 1)) { + umem = container_of(node, struct ib_umem_odp, interval_tree); + ret_val = cb(umem->umem, start, last, cookie) || ret_val; + } + + return ret_val; +} Index: sys/ofed/drivers/infiniband/core/ib_user_mad.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_user_mad.c +++ sys/ofed/drivers/infiniband/core/ib_user_mad.c @@ -33,6 +33,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) "user_mad: " fmt + #include #include #include @@ -79,10 +81,10 @@ */ struct ib_umad_port { - struct cdev *cdev; + struct cdev cdev; struct device *dev; - struct cdev *sm_cdev; + struct cdev sm_cdev; struct device *sm_dev; struct semaphore sm_sem; @@ -93,19 +95,16 @@ struct ib_umad_device *umad_dev; int dev_num; u8 port_num; - struct list_head port_lst; }; struct ib_umad_device { - int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; - struct file *filp; struct list_head recv_list; struct list_head send_list; struct list_head port_list; @@ -131,85 +130,21 @@ static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); -static void ib_umad_remove_one(struct ib_device *device); - -static DEFINE_SPINLOCK(ports_list_lock); -static struct list_head ports_list; - - -static void remove_ports(struct kref *ref) -{ - int i; - struct ib_umad_port *p, *p1; - struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); - - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - struct ib_umad_port *port = &dev->port[i]; - - list_for_each_entry_safe(p, p1, &ports_list, port_lst) - if (p == port) { - list_del(&p->port_lst); - break; - } - } -} +static void ib_umad_remove_one(struct ib_device *device, void *client_data); -static void put_umad_dev(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { - int ret, i; struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); - spin_lock(&ports_list_lock); - ret = (kref_put(ref, remove_ports)); - spin_unlock(&ports_list_lock); - if (ret) { - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS) - clear_bit(dev->port[i].dev_num, dev_map); - else - clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map); - cdev_del(dev->port[i].cdev); - cdev_del(dev->port[i].sm_cdev); - } kfree(dev); - } -} - -static void release_port(struct ib_umad_port *port) -{ - put_umad_dev(&port->umad_dev->ref); -} - - -static struct ib_umad_port *get_port(struct cdev *cdev) -{ - struct ib_umad_port *port; - - spin_lock(&ports_list_lock); - list_for_each_entry(port, &ports_list, port_lst) { - if (port->cdev == cdev || port->sm_cdev == cdev) { - kref_get(&port->umad_dev->ref); - spin_unlock(&ports_list_lock); - - return port; - } - } - spin_unlock(&ports_list_lock); - - return NULL; } -static void insert_port(struct ib_umad_port *port) -{ - spin_lock(&ports_list_lock); - 
list_add(&port->port_lst, &ports_list); - spin_unlock(&ports_list_lock); -} +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; static int hdr_size(struct ib_umad_file *file) { @@ -236,7 +171,6 @@ packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); - selwakeup(&file->filp->f_selinfo); wake_up_interruptible(&file->recv_wait); ret = 0; break; @@ -275,6 +209,7 @@ } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; @@ -327,20 +262,23 @@ { struct ib_mad_recv_buf *recv_buf; int left, seg_payload, offset, max_seg_payload; + size_t seg_size; - /* We need enough room to copy the first (or only) MAD segment. */ recv_buf = &packet->recv_wc->recv_buf; - if ((packet->length <= sizeof (*recv_buf->mad) && + seg_size = packet->recv_wc->mad_seg_size; + + /* We need enough room to copy the first (or only) MAD segment. */ + if ((packet->length <= seg_size && count < hdr_size(file) + packet->length) || - (packet->length > sizeof (*recv_buf->mad) && - count < hdr_size(file) + sizeof (*recv_buf->mad))) + (packet->length > seg_size && + count < hdr_size(file) + seg_size)) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); - seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); + seg_payload = min_t(int, packet->length, seg_size); if (copy_to_user(buf, recv_buf->mad, seg_payload)) return -EFAULT; @@ -357,14 +295,14 @@ return -ENOSPC; } offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); - max_seg_payload = sizeof (struct ib_mad) - offset; + max_seg_payload = seg_size - offset; for (left = packet->length - seg_payload, buf += seg_payload; left; left -= seg_payload, buf += seg_payload) { recv_buf = container_of(recv_buf->list.next, struct ib_mad_recv_buf, list); seg_payload = min(left, max_seg_payload); - if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + if (copy_to_user(buf, (char *)recv_buf->mad + offset, seg_payload)) return -EFAULT; } @@ -445,7 +383,7 @@ /* Copy class specific header */ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && - copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + copy_from_user((char *)msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, msg->hdr_len - IB_MGMT_RMPP_HDR)) return -EFAULT; @@ -490,11 +428,11 @@ * the same TID, reject the second as a duplicate. This is more * restrictive than required by the spec. 
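The recv_handler() change above tracks the updated ib_mad_recv_handler prototype, which now also passes the matching request's ib_mad_send_buf (or NULL for unsolicited MADs). A hedged sketch of a kernel consumer's handler under that prototype; my_mad_recv_handler is a made-up name.

#include <rdma/ib_mad.h>

/* Illustrative only: a receive handler matching the new prototype. */
static void my_mad_recv_handler(struct ib_mad_agent *agent,
				struct ib_mad_send_buf *send_buf,
				struct ib_mad_recv_wc *mad_recv_wc)
{
	if (send_buf != NULL)
		pr_debug("MAD is a response to request %p\n", send_buf);

	/* Consume mad_recv_wc->recv_buf.mad here, then free the work
	 * completion; the MAD layer owns the underlying buffers. */
	ib_free_recv_mad(mad_recv_wc);
}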
*/ - if (!ib_response_mad((struct ib_mad *) hdr)) { - if (!ib_response_mad((struct ib_mad *) sent_hdr)) + if (!ib_response_mad(hdr)) { + if (!ib_response_mad(sent_hdr)) return 1; continue; - } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + } else if (!ib_response_mad(sent_hdr)) continue; if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) @@ -515,6 +453,7 @@ struct ib_rmpp_mad *rmpp_mad; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; + u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; @@ -557,8 +496,8 @@ ah_attr.ah_flags = IB_AH_GRH; memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; - ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); - ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; + ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); + ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; } @@ -570,35 +509,39 @@ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) { - copy_offset = IB_MGMT_MAD_HDR; - rmpp_active = 0; - } else { + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & - IB_MGMT_RMPP_FLAG_ACTIVE; + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; } + base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; data_len = count - hdr_size(file) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, - hdr_len, data_len, GFP_KERNEL); + hdr_len, data_len, GFP_KERNEL, + base_version); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; } - packet->msg->ah = ah; + packet->msg->ah = ah; packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; - packet->msg->retries = packet->mad.hdr.retries; + packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; /* Copy MAD header. Any RMPP header is already in place. 
*/ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); if (!rmpp_active) { - if (copy_from_user(packet->msg->mad + copy_offset, + if (copy_from_user((char *)packet->msg->mad + copy_offset, buf + copy_offset, hdr_len + data_len - copy_offset)) { ret = -EFAULT; @@ -622,14 +565,22 @@ rmpp_mad->mad_hdr.tid = *tid; } - spin_lock_irq(&file->send_lock); - ret = is_duplicate(file, packet); - if (!ret) + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); - spin_unlock_irq(&file->send_lock); - if (ret) { - ret = -EINVAL; - goto err_msg; + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } } ret = ib_post_send_mad(packet->msg, NULL); @@ -680,6 +631,8 @@ mutex_lock(&file->mutex); if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); ret = -EPIPE; goto out; } @@ -690,6 +643,9 @@ } if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); ret = -EINVAL; goto out; } @@ -698,11 +654,15 @@ if (!__get_agent(file, agent_id)) goto found; + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); @@ -723,7 +683,7 @@ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? 
&req : NULL, ureq.rmpp_version, - send_handler, recv_handler, file); + send_handler, recv_handler, file, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; @@ -731,7 +691,7 @@ } if (put_user(agent_id, - (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { + (u32 __user *) ((char *)arg + offsetof(struct ib_user_mad_reg_req, id)))) { ret = -EFAULT; goto out; } @@ -739,10 +699,11 @@ if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { - printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", curthread->td_proc->p_comm); - printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " - "has info on the new ABI.\n"); + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); } } @@ -760,6 +721,120 @@ return ret; } +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + const u32 flags = IB_USER_MAD_REG_FLAGS_CAP; + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user(flags, + (u32 __user *) ((char *)arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? 
&req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)((char *)arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; @@ -815,6 +890,8 @@ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); default: return -ENOIOCTLCMD; } @@ -831,6 +908,8 @@ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } @@ -850,26 +929,19 @@ { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - release_port(port); - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - release_port(port); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -878,12 +950,18 @@ init_waitqueue_head(&file->recv_wait); file->port = port; - file->filp = filp; filp->private_data = file; list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); out: mutex_unlock(&port->file_mutex); @@ -893,7 +971,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_port *port = file->port; + struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -922,21 +1000,21 @@ mutex_unlock(&file->port->file_mutex); kfree(file); - release_port(port); + kobject_put(&dev->kobj); return 0; } static const struct file_operations umad_fops = { - .owner = THIS_MODULE, - .read = ib_umad_read, - .write = ib_umad_write, - .poll = ib_umad_poll, + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = ib_umad_compat_ioctl, + .compat_ioctl = ib_umad_compat_ioctl, #endif - .open = ib_umad_open, + .open = ib_umad_open, .release = ib_umad_close, .llseek = no_llseek, }; @@ -949,9 +1027,7 @@ }; int ret; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, sm_cdev); if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { @@ -966,17 +1042,27 @@ } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - 
goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); fail: - release_port(port); return ret; } @@ -995,14 +1081,14 @@ up(&port->sm_sem); - release_port(port); + kobject_put(&port->umad_dev->kobj); return ret; } static const struct file_operations umad_sm_fops = { - .owner = THIS_MODULE, - .open = ib_umad_sm_open, + .owner = THIS_MODULE, + .open = ib_umad_sm_open, .release = ib_umad_sm_close, .llseek = no_llseek, }; @@ -1037,14 +1123,12 @@ } static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); -} -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_MAD_ABI_VERSION)); static dev_t overflow_maj; -static int find_overflow_devnum(void) +static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); +static int find_overflow_devnum(struct ib_device *device) { int ret; @@ -1052,7 +1136,8 @@ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + dev_err(&device->dev, + "couldn't register dynamic device number\n"); return ret; } } @@ -1065,6 +1150,7 @@ } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; @@ -1074,9 +1160,9 @@ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); + devnum = find_overflow_devnum(device); if (devnum < 0) - return -1; + return -1; spin_lock(&port_lock); port->dev_num = devnum + IB_UMAD_MAX_PORTS; @@ -1095,18 +1181,15 @@ mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); - port->cdev = cdev_alloc(); - if (!port->cdev) - goto err_cdev_c; - - port->cdev->ops = &umad_fops; - port->cdev->owner = THIS_MODULE; - kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num); - if (cdev_add(port->cdev, base, 1)) + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); + if (cdev_add(&port->cdev, base, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dma_device, - port->cdev->dev, port, + port->cdev.dev, port, "umad%d", port->dev_num); if (IS_ERR(port->dev)) goto err_cdev; @@ -1117,18 +1200,15 @@ goto err_dev; base += IB_UMAD_MAX_PORTS; - port->sm_cdev = cdev_alloc(); - if (!port->sm_cdev) - goto err_dev; - - port->sm_cdev->ops = &umad_sm_fops; - port->sm_cdev->owner = THIS_MODULE; - kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num); - if (cdev_add(port->sm_cdev, base, 1)) + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); + if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dma_device, - port->sm_cdev->dev, port, + port->sm_cdev.dev, port, 
"issm%d", port->dev_num); if (IS_ERR(port->sm_dev)) goto err_sm_cdev; @@ -1141,17 +1221,16 @@ return 0; err_sm_dev: - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->sm_cdev.dev); err_sm_cdev: - cdev_del(port->sm_cdev); + cdev_del(&port->sm_cdev); err_dev: - device_destroy(umad_class, port->cdev->dev); + device_destroy(umad_class, port->cdev.dev); err_cdev: - cdev_del(port->cdev); -err_cdev_c: + cdev_del(&port->cdev); if (port->dev_num < IB_UMAD_MAX_PORTS) clear_bit(devnum, dev_map); else @@ -1168,8 +1247,11 @@ dev_set_drvdata(port->dev, NULL); dev_set_drvdata(port->sm_dev, NULL); - device_destroy(umad_class, port->cdev->dev); - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->sm_cdev.dev); + + cdev_del(&port->cdev); + cdev_del(&port->sm_cdev); mutex_lock(&port->file_mutex); @@ -1186,22 +1268,21 @@ } mutex_unlock(&port->file_mutex); + + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(port->dev_num, dev_map); + else + clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); } static void ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; + int count = 0; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - - if (device->node_type == RDMA_NODE_IB_SWITCH) - s = e = 0; - else { - s = 1; - e = device->phys_port_cnt; - } + s = rdma_start_port(device); + e = rdma_end_port(device); umad_dev = kzalloc(sizeof *umad_dev + (e - s + 1) * sizeof (struct ib_umad_port), @@ -1209,44 +1290,53 @@ if (!umad_dev) return; - kref_init(&umad_dev->ref); - - umad_dev->start_port = s; - umad_dev->end_port = e; - - for (i = 0; i <= e - s; ++i) - insert_port(&umad_dev->port[i]); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); for (i = s; i <= e; ++i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) - goto err; + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) + goto err; + + count++; } + if (!count) + goto free; + ib_set_client_data(device, &umad_client, umad_dev); return; err: - while (--i >= s) - ib_umad_kill_port(&umad_dev->port[i - s]); + while (--i >= s) { + if (!rdma_cap_ib_mad(device, i)) + continue; - put_umad_dev(&umad_dev->ref); + ib_umad_kill_port(&umad_dev->port[i - s]); + } +free: + kobject_put(&umad_dev->kobj); } -static void ib_umad_remove_one(struct ib_device *device) +static void ib_umad_remove_one(struct ib_device *device, void *client_data) { - struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); + struct ib_umad_device *umad_dev = client_data; int i; if (!umad_dev) return; - for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) + for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { + if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) ib_umad_kill_port(&umad_dev->port[i]); + } - put_umad_dev(&umad_dev->ref); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) @@ -1258,33 +1348,31 @@ { int ret; - INIT_LIST_HEAD(&ports_list); - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register device number\n"); + pr_err("couldn't register device number\n"); goto out; } umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); - printk(KERN_ERR "user_mad: 
couldn't create class infiniband_mad\n"); + pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } umad_class->devnode = umad_devnode; - ret = class_create_file(umad_class, &class_attr_abi_version); + ret = class_create_file(umad_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); + pr_err("couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&umad_client); if (ret) { - printk(KERN_ERR "user_mad: couldn't register ib_umad client\n"); + pr_err("couldn't register ib_umad client\n"); goto out_class; } @@ -1309,5 +1397,5 @@ unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); } -module_init(ib_umad_init); +module_init_order(ib_umad_init, SI_ORDER_THIRD); module_exit(ib_umad_cleanup); Index: sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c +++ sys/ofed/drivers/infiniband/core/ib_uverbs_cmd.c @@ -38,25 +38,17 @@ #include #include #include -#include +#include #include -#include -#include #include -#include -#include #include "uverbs.h" +#include "core_priv.h" -static int disable_raw_qp_enforcement; -module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int, - 0444); -MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " - "being opened by root (default: 0)"); +#include struct uverbs_lock_class { - struct lock_class_key key; char name[16]; }; @@ -68,44 +60,19 @@ static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" }; - -static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) -{ - return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; -} - -static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len) -{ - return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; -} - -static struct ib_udata_ops uverbs_copy = { - .copy_from = uverbs_copy_from_udata, - .copy_to = uverbs_copy_to_udata -}; - -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->ops = &uverbs_copy; \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - -enum uverbs_cmd_type { - IB_USER_VERBS_CMD_BASIC, - IB_USER_VERBS_CMD_EXTENDED -}; +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; +static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; +static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; /* * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, the rcu_read_lock() + * and rcu_write_lock() but similarly the kref reference is grabbed + * before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. 
@@ -131,13 +98,12 @@ uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); - lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); uobj->live = 0; } static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) @@ -161,18 +127,17 @@ { int ret; -retry: - if (!idr_pre_get(idr, GFP_KERNEL)) - return -ENOMEM; - + idr_preload(GFP_KERNEL); spin_lock(&ib_uverbs_idr_lock); - ret = idr_get_new(idr, uobj, &uobj->id); - spin_unlock(&ib_uverbs_idr_lock); - if (ret == -EAGAIN) - goto retry; + ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; - return ret; + spin_unlock(&ib_uverbs_idr_lock); + idr_preload_end(); + + return ret < 0 ? ret : 0; } void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) @@ -187,7 +152,7 @@ { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) @@ -195,7 +160,7 @@ else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } @@ -283,6 +248,27 @@ return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } +static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0); +} + +static void put_wq_read(struct ib_wq *wq) +{ + put_uobj_read(wq->uobject); +} + +static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, + struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); +} + +static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) +{ + put_uobj_read(ind_table->uobject); +} + static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; @@ -301,16 +287,6 @@ put_uobj_write(qp->uobject); } -static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0); -} - -static void put_dct_read(struct ib_dct *dct) -{ - put_uobj_read(dct->uobject); -} - static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) { return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); @@ -334,13 +310,13 @@ } ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; - struct ib_device *ibdev = file->device->ib_dev; struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -362,13 +338,13 @@ (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - ucontext = ibdev->alloc_ucontext(ibdev, &udata); + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); goto err; } - ucontext->device = ibdev; + ucontext->device = ib_dev; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); INIT_LIST_HEAD(&ucontext->mw_list); @@ -376,41 +352,45 @@ INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->wq_list); + INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); - INIT_LIST_HEAD(&ucontext->dct_list); + rcu_read_lock(); + ucontext->tgid = 
get_pid(task_pid_group_leader(current)); + rcu_read_unlock(); ucontext->closing = 0; - ucontext->peer_mem_private_data = NULL; - ucontext->peer_mem_name = NULL; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ucontext->umem_tree = RB_ROOT; + init_rwsem(&ucontext->umem_rwsem); + ucontext->odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->no_private_counters); + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; + +#endif resp.num_comp_vectors = file->device->num_comp_vectors; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; resp.async_fd = ret; - filp = ib_uverbs_alloc_event_file(file, 1); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 1); if (IS_ERR(filp)) { ret = PTR_ERR(filp); goto err_fd; } - file->async_file = filp->private_data; - - INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, - ib_uverbs_event_handler); - ret = ib_register_event_handler(&file->event_handler); - if (ret) - goto err_file; - if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; goto err_file; } - kref_get(&file->async_file->ref); - kref_get(&file->ref); + file->ucontext = ucontext; fd_install(resp.async_fd, filp); @@ -420,76 +400,75 @@ return in_len; err_file: + ib_uverbs_free_async_event_file(file); fput(filp); err_fd: put_unused_fd(resp.async_fd); err_free: - ibdev->dealloc_ucontext(ucontext); + put_pid(ucontext->tgid); + ib_dev->dealloc_ucontext(ucontext); err: mutex_unlock(&file->mutex); return ret; } -static void ib_uverbs_query_device_assign( - struct ib_uverbs_query_device_resp *resp, - struct ib_device_attr *attr, - struct ib_uverbs_file *file) -{ - memset(resp, 0, sizeof(*resp)); - - resp->fw_ver = attr->fw_ver; - resp->node_guid = file->device->ib_dev->node_guid; - resp->sys_image_guid = attr->sys_image_guid; - resp->max_mr_size = attr->max_mr_size; - resp->page_size_cap = attr->page_size_cap; - resp->vendor_id = attr->vendor_id; - resp->vendor_part_id = attr->vendor_part_id; - resp->hw_ver = attr->hw_ver; - resp->max_qp = attr->max_qp; - resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = attr->device_cap_flags; - resp->max_sge = attr->max_sge; - resp->max_sge_rd = attr->max_sge_rd; - resp->max_cq = attr->max_cq; - resp->max_cqe = attr->max_cqe; - resp->max_mr = attr->max_mr; - resp->max_pd = attr->max_pd; - resp->max_qp_rd_atom = attr->max_qp_rd_atom; - resp->max_ee_rd_atom = attr->max_ee_rd_atom; - resp->max_res_rd_atom = attr->max_res_rd_atom; - resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; - resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; - resp->atomic_cap = attr->atomic_cap; - resp->max_ee = attr->max_ee; - resp->max_rdd = attr->max_rdd; - resp->max_mw = attr->max_mw; - resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; - resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; - resp->max_mcast_grp = attr->max_mcast_grp; - resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; - resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; - resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; - resp->max_map_per_fmr = attr->max_map_per_fmr; - resp->max_srq = attr->max_srq; - resp->max_srq_wr = attr->max_srq_wr; - resp->max_srq_sge = attr->max_srq_sge; - resp->max_pkeys = attr->max_pkeys; - resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct 
ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = (u32)(attr->device_cap_flags); + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = ib_dev->phys_port_cnt; } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -497,20 +476,18 @@ if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(file->device->ib_dev, &attr); - if (ret) - return ret; - - ib_uverbs_query_device_assign(&resp, &attr, file); + memset(&resp, 0, sizeof resp); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) return -EFAULT; return in_len; } ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -525,7 +502,7 @@ if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); + ret = ib_query_port(ib_dev, cmd.port_num, &attr); if (ret) return ret; @@ -550,7 +527,7 @@ resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; - resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, + resp.link_layer = rdma_port_get_link_layer(ib_dev, cmd.port_num); if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -561,6 +538,7 @@ } ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -588,15 +566,15 @@ init_uobj(uobj, 0, file->ucontext, &pd_lock_class); down_write(&uobj->mutex); - pd = 
file->device->ib_dev->alloc_pd(file->device->ib_dev, - file->ucontext, &udata); + pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); if (IS_ERR(pd)) { ret = PTR_ERR(pd); goto err; } - pd->device = file->device->ib_dev; + pd->device = ib_dev; pd->uobject = uobj; + pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); uobj->object = pd; @@ -635,11 +613,13 @@ } ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; + struct ib_pd *pd; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -648,15 +628,20 @@ uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); if (!uobj) return -EINVAL; + pd = uobj->object; - ret = ib_dealloc_pd(uobj->object); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); + if (atomic_read(&pd->usecnt)) { + ret = -EBUSY; + goto err_put; + } + ret = pd->device->dealloc_pd(uobj->object); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); if (ret) - return ret; + goto err_put; + + uobj->live = 0; + put_uobj_write(uobj); idr_remove_uobj(&ib_uverbs_pd_idr, uobj); @@ -667,6 +652,10 @@ put_uobj(uobj); return in_len; + +err_put: + put_uobj_write(uobj); + return ret; } struct xrcd_table_entry { @@ -755,12 +744,13 @@ } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_open_xrcd cmd; struct ib_uverbs_open_xrcd_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; struct fd f = {NULL}; @@ -776,7 +766,7 @@ INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd, out_len - sizeof resp); mutex_lock(&file->device->xrcd_tree_mutex); @@ -797,7 +787,7 @@ } if (xrcd && cmd.oflags & O_EXCL) { - ret = -EINVAL; + ret = -EINVAL; goto err_tree_mutex_unlock; } } @@ -813,15 +803,14 @@ down_write(&obj->uobject.mutex); if (!xrcd) { - xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, - file->ucontext, &udata); + xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } xrcd->inode = inode; - xrcd->device = file->device->ib_dev; + xrcd->device = ib_dev; atomic_set(&xrcd->usecnt, 0); mutex_init(&xrcd->tgt_qp_mutex); INIT_LIST_HEAD(&xrcd->tgt_qp_list); @@ -892,11 +881,12 @@ } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_close_xrcd cmd; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_xrcd *xrcd = NULL; struct inode *inode = NULL; struct ib_uxrcd_object *obj; @@ -924,8 +914,8 @@ if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { ret = ib_dealloc_xrcd(uobj->object); - if (!ret) - uobj->live = 0; + if (!ret) + uobj->live = 0; } live = uobj->live; @@ -969,16 +959,17 @@ } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; - int ret; + int ret; if (out_len < sizeof resp) 
return -ENOSPC; @@ -1006,30 +997,35 @@ pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { - ret = -EINVAL; + ret = -EINVAL; goto err_free; } - /* We first get a new "obj id" to be passed later to reg mr for - further use as mr_id. - */ - ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); - if (ret) - goto err_put; + + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, &udata, uobj->id); + cmd.access_flags, &udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); - goto err_remove_uobj; + goto err_put; } mr->device = pd->device; mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_unreg; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; @@ -1055,11 +1051,11 @@ return in_len; err_copy: - ib_dereg_mr(mr); - -err_remove_uobj: idr_remove_uobj(&ib_uverbs_mr_idr, uobj); +err_unreg: + ib_dereg_mr(mr); + err_put: put_pd_read(pd); @@ -1068,14 +1064,104 @@ return ret; } +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -1107,14 +1193,16 @@ } ssize_t 
ib_uverbs_alloc_mw(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_alloc_mw cmd; struct ib_uverbs_alloc_mw_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; + struct ib_udata udata; int ret; if (out_len < sizeof(resp)) @@ -1136,7 +1224,12 @@ goto err_free; } - mw = pd->device->alloc_mw(pd, cmd.mw_type); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; @@ -1178,7 +1271,7 @@ idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); @@ -1189,13 +1282,14 @@ } ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_mw cmd; struct ib_mw *mw; - struct ib_uobject *uobj; - int ret = -EINVAL; + struct ib_uobject *uobj; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; @@ -1206,7 +1300,7 @@ mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; @@ -1227,8 +1321,9 @@ } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_comp_channel cmd; struct ib_uverbs_create_comp_channel_resp resp; @@ -1241,12 +1336,12 @@ if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) return ret; resp.fd = ret; - filp = ib_uverbs_alloc_event_file(file, 0); + filp = ib_uverbs_alloc_event_file(file, ib_dev, 0); if (IS_ERR(filp)) { put_unused_fd(resp.fd); return PTR_ERR(filp); @@ -1263,40 +1358,34 @@ return in_len; } -static ssize_t create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len, void *vcmd, int ex, - void __user *response) +static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_cq *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *udata, + void *context), + void *context) { - struct ib_uverbs_create_cq *cmd; - struct ib_uverbs_create_cq_ex *cmd_e; - struct ib_uverbs_create_cq_resp resp; - struct ib_udata udata; struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; - struct ib_cq_init_attr attr; - int cmd_sz; int ret; - - if (out_len < sizeof resp) - return -ENOSPC; - - cmd = vcmd; - cmd_e = vcmd; - cmd_sz = ex ? 
sizeof(*cmd_e) : sizeof(*cmd); - INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_cq_init_attr attr = {}; if (cmd->comp_vector >= file->device->num_comp_vectors) - return -EINVAL; + return ERR_PTR(-EINVAL); obj = kmalloc(sizeof *obj, GFP_KERNEL); if (!obj) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, - &cq_lock_class); + init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class); down_write(&obj->uobject.mutex); if (cmd->comp_channel >= 0) { @@ -1313,19 +1402,20 @@ INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - memset(&attr, 0, sizeof(attr)); attr.cqe = cmd->cqe; attr.comp_vector = cmd->comp_vector; - if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS)) - attr.flags = cmd_e->create_flags; - cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, - file->ucontext, &udata); + + if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) + attr.flags = cmd->flags; + + cq = ib_dev->create_cq(ib_dev, &attr, + file->ucontext, uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; } - cq->device = file->device->ib_dev; + cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; @@ -1338,13 +1428,15 @@ goto err_free; memset(&resp, 0, sizeof resp); - resp.cq_handle = obj->uobject.id; - resp.cqe = cq->cqe; + resp.base.cq_handle = obj->uobject.id; + resp.base.cqe = cq->cqe; - if (copy_to_user(response, &resp, sizeof(resp))) { - ret = -EFAULT; - goto err_copy; - } + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + ret = cb(file, obj, &resp, ucore, context); + if (ret) + goto err_cb; mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); @@ -1354,9 +1446,9 @@ up_write(&obj->uobject.mutex); - return in_len; + return obj; -err_copy: +err_cb: idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); err_free: @@ -1368,24 +1460,112 @@ err: put_uobj_write(&obj->uobject); - return ret; + + return ERR_PTR(ret); +} + +static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; } ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { - struct ib_uverbs_create_cq cmd; + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_ex_create_cq cmd_ex; + struct ib_uverbs_create_cq_resp resp; + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_ucq_object *obj; + + if (out_len < sizeof(resp)) + return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_BASIC, - (void __user *) (unsigned long) cmd.response); + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), sizeof(resp)); + + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.cqe = cmd.cqe; + cmd_ex.comp_vector = cmd.comp_vector; + cmd_ex.comp_channel = 
cmd.comp_channel; + + obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), comp_channel) + + sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, + NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return in_len; +} + +static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq cmd; + struct ib_ucq_object *obj; + int err; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + obj = create_cq(file, ib_dev, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_cq_cb, NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return 0; } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1449,8 +1629,9 @@ } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_poll_cq cmd; struct ib_uverbs_poll_cq_resp resp; @@ -1458,7 +1639,7 @@ u8 __user *data_ptr; struct ib_cq *cq; struct ib_wc wc; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -1500,6 +1681,7 @@ } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1522,6 +1704,7 @@ } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1573,63 +1756,71 @@ return in_len; } -ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +static int create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_qp *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *udata), + void *context) { - void __user *response; - struct ib_udata udata; - struct ib_uqp_object *obj; - struct ib_device *device; - struct ib_pd *pd = NULL; - struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_cq *scq = NULL, *rcq = NULL; - struct ib_srq *srq = NULL; - struct ib_qp *qp; - struct ib_qp_init_attr attr; - int ret; - union { - struct ib_uverbs_create_qp basic; - } cmd_obj; - struct ib_uverbs_create_qp *cmd; - size_t cmd_size = 0; - union { - struct ib_uverbs_create_qp_resp basic; - } resp_obj; - struct ib_uverbs_create_qp_resp *resp; - size_t resp_size = 0; - - cmd_size = sizeof(cmd_obj.basic); - cmd = &cmd_obj.basic; - - resp_size = sizeof(resp_obj.basic); - resp = &resp_obj.basic; - - if (out_len < resp_size) - return -ENOSPC; - - if (copy_from_user(&cmd_obj, buf, cmd_size)) - return -EFAULT; - - response = (void __user *) (unsigned long) cmd->response; + struct 
ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + char *buf; + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_ex_create_qp_resp resp; + int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; - if (!disable_raw_qp_enforcement && - cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) + if (cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW) != 0) return -EPERM; - INIT_UDATA(&udata, buf + cmd_size, response + resp_size, - in_len - cmd_size, out_len - resp_size); - obj = kzalloc(sizeof *obj, GFP_KERNEL); if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, + &qp_lock_class); down_write(&obj->uevent.uobject.mutex); + if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + + sizeof(cmd->rwq_ind_tbl_handle) && + (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { + ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + + attr.rwq_ind_tbl = ind_tbl; + } + + if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + + sizeof(cmd->reserved1)) && cmd->reserved1) { + ret = -EOPNOTSUPP; + goto err_put; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj); + xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, + &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; @@ -1641,41 +1832,47 @@ cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { - srq = idr_read_srq(cmd->srq_handle, file->ucontext); + srq = idr_read_srq(cmd->srq_handle, + file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { - ret = -EINVAL; - goto err_put; - } - } - - if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; + ret = -EINVAL; goto err_put; + } } + + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = idr_read_cq(cmd->recv_cq_handle, + file->ucontext, 0); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } } } - scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); - rcq = rcq ?: scq; + if (has_sq) + scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + if (!ind_tbl) + rcq = rcq ?: scq; pd = idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd || !scq) { + if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; - } + } device = pd->device; - } + } - memset(&attr, 0, sizeof attr); attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; attr.xrcd = xrcd; - attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr.sq_sig_type = cmd->sq_sig_all ? 
IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; attr.create_flags = 0; @@ -1689,10 +1886,31 @@ INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); + if (cmd_sz >= offsetof(typeof(*cmd), create_flags) + + sizeof(cmd->create_flags)) + attr.create_flags = cmd->create_flags; + + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS)) { + ret = -EINVAL; + goto err_put; + } + + buf = (char *)cmd + sizeof(*cmd); + if (cmd_sz > sizeof(*cmd)) + if (!(buf[0] == 0 && !memcmp(buf, buf + 1, + cmd_sz - sizeof(*cmd) - 1))) { + ret = -EINVAL; + goto err_put; + } + if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, &udata); + qp = device->create_qp(pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); @@ -1706,16 +1924,20 @@ qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; + qp->rwq_ind_tbl = ind_tbl; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); } qp->uobject = &obj->uevent.uobject; @@ -1724,25 +1946,28 @@ if (ret) goto err_destroy; - memset(&resp_obj, 0, sizeof(resp_obj)); - resp->qpn = qp->qp_num; - resp->qp_handle = obj->uevent.uobject.id; - resp->max_recv_sge = attr.cap.max_recv_sge; - resp->max_send_sge = attr.cap.max_send_sge; - resp->max_recv_wr = attr.cap.max_recv_wr; - resp->max_send_wr = attr.cap.max_send_wr; - resp->max_inline_data = attr.cap.max_inline_data; - - if (copy_to_user(response, &resp_obj, resp_size)) { - ret = -EFAULT; - goto err_copy; - } + memset(&resp, 0, sizeof resp); + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + ret = cb(file, &resp, ucore); + if (ret) + goto err_cb; if (xrcd) { - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); - } + } if (pd) put_pd_read(pd); @@ -1752,6 +1977,8 @@ put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -1761,9 +1988,8 @@ up_write(&obj->uevent.uobject.mutex); - return in_len; - -err_copy: + return 0; +err_cb: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: @@ -1780,12 +2006,123 @@ put_cq_read(rcq); if (srq) put_srq_read(srq); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); return ret; } +static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + 
return 0; +} + +ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_ex_create_qp cmd_ex; + struct ib_udata ucore; + struct ib_udata uhw; + ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp); + int err; + + if (out_len < resp_size) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), + resp_size); + INIT_UDATA(&uhw, buf + sizeof(cmd), + (unsigned long)cmd.response + resp_size, + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.pd_handle = cmd.pd_handle; + cmd_ex.send_cq_handle = cmd.send_cq_handle; + cmd_ex.recv_cq_handle = cmd.recv_cq_handle; + cmd_ex.srq_handle = cmd.srq_handle; + cmd_ex.max_send_wr = cmd.max_send_wr; + cmd_ex.max_recv_wr = cmd.max_recv_wr; + cmd_ex.max_send_sge = cmd.max_send_sge; + cmd_ex.max_recv_sge = cmd.max_recv_sge; + cmd_ex.max_inline_data = cmd.max_inline_data; + cmd_ex.sq_sig_all = cmd.sq_sig_all; + cmd_ex.qp_type = cmd.qp_type; + cmd_ex.is_srq = cmd.is_srq; + + err = create_qp(file, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), is_srq) + + sizeof(cmd.is_srq), ib_uverbs_create_qp_cb, + NULL); + + if (err) + return err; + + return in_len; +} + +static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp cmd = {0}; + int err; + + if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) + + sizeof(cmd.comp_mask))) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + err = create_qp(file, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_qp_cb, NULL); + + if (err) + return err; + + return 0; +} + ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_qp cmd; @@ -1834,7 +2171,7 @@ if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; - } + } qp->uobject = &obj->uevent.uobject; @@ -1862,6 +2199,7 @@ mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; + up_write(&obj->uevent.uobject.mutex); return in_len; @@ -1879,8 +2217,9 @@ } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; @@ -1992,45 +2331,31 @@ } } -static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file, +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, - int out_len, - enum uverbs_cmd_type cmd_type) + int out_len) { - struct ib_uverbs_modify_qp_ex cmd; - 
struct ib_udata udata; - struct ib_qp *qp; - struct ib_qp_attr *attr; - struct ib_qp_attr_ex *attrx; - int ret; - void *p; - union ib_gid sgid; - union ib_gid *dgid; - u8 port_num; - - if (cmd_type == IB_USER_VERBS_CMD_BASIC) { - p = &cmd; - p += sizeof(cmd.comp_mask); - if (copy_from_user(p, buf, - sizeof(struct ib_uverbs_modify_qp))) + struct ib_uverbs_modify_qp cmd; + struct ib_udata udata; + struct ib_qp *qp; + struct ib_qp_attr *attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - } else { - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; - } INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); - if (!attrx) + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) return -ENOMEM; - attr = (struct ib_qp_attr *)attrx; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { - kfree(attrx); - return -EINVAL; + ret = -EINVAL; + goto out; } attr->qp_state = cmd.qp_state; @@ -2078,77 +2403,39 @@ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - port_num = (cmd.attr_mask & IB_QP_PORT) ? cmd.port_num : qp->port_num; - if ((cmd.attr_mask & IB_QP_AV) && port_num && - (rdma_port_get_link_layer(qp->device, port_num) == - IB_LINK_LAYER_ETHERNET)) { - ret = ib_query_gid(qp->device, port_num, - attr->ah_attr.grh.sgid_index, &sgid); - if (ret) - goto out; - dgid = &attr->ah_attr.grh.dgid; - if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) { - rdma_get_ll_mac((struct in6_addr *)dgid->raw, - attr->ah_attr.dmac); - rdma_get_ll_mac((struct in6_addr *)sgid.raw, - attr->smac); - attr->vlan_id = rdma_get_vlan_id(&sgid); - } else { - ret = rdma_addr_find_dmac_by_grh(&sgid, dgid, - attr->ah_attr.dmac, - &attr->vlan_id, -1U); - if (ret) - goto out; - ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac, - NULL, -1U); - if (ret) - goto out; - } - cmd.attr_mask |= IB_QP_SMAC; - if (attr->vlan_id < 0xFFFF) - cmd.attr_mask |= IB_QP_VID; - } - if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) { - if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY) - attrx->dct_key = cmd.dct_key; - } if (qp->real_qp == qp) { + ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask); + if (ret) + goto release_qp; ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); - if (!ret && (cmd.attr_mask & IB_QP_PORT)) - qp->port_num = attr->port_num; } else { ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); } if (ret) - goto out; + goto release_qp; ret = in_len; -out: +release_qp: put_qp_read(qp); - kfree(attrx); - return ret; -} +out: + kfree(attr); -ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_BASIC); + return ret; } ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; @@ -2200,18 +2487,26 @@ return in_len; } +static void *alloc_wr(size_t wr_size, __u32 num_sge) +{ + return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + + num_sge * sizeof (struct ib_sge), GFP_KERNEL); +}; + ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, - 
const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; - struct ib_qp *qp; + struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; + size_t next_size; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2247,17 +2542,90 @@ goto out_put; } - next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + - user_wr->num_sge * sizeof (struct ib_sge), - GFP_KERNEL); - if (!next) { - ret = -ENOMEM; - goto out_put; - } + if (is_ud) { + struct ib_ud_wr *ud; + + if (user_wr->opcode != IB_WR_SEND && + user_wr->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); + if (!ud) { + ret = -ENOMEM; + goto out_put; + } + + ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); + if (!ud->ah) { + kfree(ud); + ret = -EINVAL; + goto out_put; + } + ud->remote_qpn = user_wr->wr.ud.remote_qpn; + ud->remote_qkey = user_wr->wr.ud.remote_qkey; + + next = &ud->wr; + } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE || + user_wr->opcode == IB_WR_RDMA_READ) { + struct ib_rdma_wr *rdma; + + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); + if (!rdma) { + ret = -ENOMEM; + goto out_put; + } + + rdma->remote_addr = user_wr->wr.rdma.remote_addr; + rdma->rkey = user_wr->wr.rdma.rkey; + + next = &rdma->wr; + } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct ib_atomic_wr *atomic; + + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); + if (!atomic) { + ret = -ENOMEM; + goto out_put; + } + + atomic->remote_addr = user_wr->wr.atomic.remote_addr; + atomic->compare_add = user_wr->wr.atomic.compare_add; + atomic->swap = user_wr->wr.atomic.swap; + atomic->rkey = user_wr->wr.atomic.rkey; + + next = &atomic->wr; + } else if (user_wr->opcode == IB_WR_SEND || + user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_SEND_WITH_INV) { + next_size = sizeof(*next); + next = alloc_wr(next_size, user_wr->num_sge); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + } else { + ret = -EINVAL; + goto out_put; + } + + if (user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { + next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; + } if (!last) wr = next; - else + else last->next = next; last = next; @@ -2267,54 +2635,11 @@ next->opcode = user_wr->opcode; next->send_flags = user_wr->send_flags; - if (is_ud) { - next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, - file->ucontext); - if (!next->wr.ud.ah) { - ret = -EINVAL; - goto out_put; - } - next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; - next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; - } else { - switch (next->opcode) { - case IB_WR_RDMA_WRITE_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - next->wr.rdma.remote_addr = - user_wr->wr.rdma.remote_addr; - next->wr.rdma.rkey = - user_wr->wr.rdma.rkey; - break; - case IB_WR_SEND_WITH_IMM: - next->ex.imm_data = - (__be32 __force) user_wr->ex.imm_data; - break; - case 
IB_WR_SEND_WITH_INV: - next->ex.invalidate_rkey = - user_wr->ex.invalidate_rkey; - break; - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - next->wr.atomic.remote_addr = - user_wr->wr.atomic.remote_addr; - next->wr.atomic.compare_add = - user_wr->wr.atomic.compare_add; - next->wr.atomic.swap = user_wr->wr.atomic.swap; - next->wr.atomic.rkey = user_wr->wr.atomic.rkey; - break; - default: - break; - } - } - if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)((char *)next + + ALIGN(next_size, sizeof(struct ib_sge))); if (copy_from_user(next->sg_list, - buf + sizeof cmd + + (const char *)buf + sizeof cmd + cmd.wr_count * cmd.wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { @@ -2332,7 +2657,7 @@ for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) - break; + break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -2343,8 +2668,8 @@ put_qp_read(qp); while (wr) { - if (is_ud && wr->wr.ud.ah) - put_ah_read(wr->wr.ud.ah); + if (is_ud && ud_wr(wr)->ah) + put_ah_read(ud_wr(wr)->ah); next = wr->next; kfree(wr); wr = next; @@ -2366,7 +2691,7 @@ struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; - int ret; + int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) @@ -2389,9 +2714,9 @@ } if (user_wr->num_sge + sg_ind > sge_count) { - ret = -EINVAL; - goto err; - } + ret = -EINVAL; + goto err; + } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), @@ -2399,7 +2724,7 @@ if (!next) { ret = -ENOMEM; goto err; - } + } if (!last) wr = next; @@ -2412,10 +2737,10 @@ next->num_sge = user_wr->num_sge; if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)((char *)next + + ALIGN(sizeof *next, sizeof (struct ib_sge))); if (copy_from_user(next->sg_list, - buf + wr_count * wqe_size + + (const char *)buf + wr_count * wqe_size + sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; @@ -2442,6 +2767,7 @@ } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2491,6 +2817,7 @@ } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2540,8 +2867,9 @@ } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; @@ -2564,7 +2892,7 @@ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; @@ -2580,6 +2908,7 @@ attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; + memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = ib_create_ah(pd, &attr); @@ -2630,12 +2959,13 @@ } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject 
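The post_send rework above replaces the old union-based work request with per-opcode structures carved out of a single allocation: alloc_wr() reserves the WR header, pads it to SGE alignment, and places the scatter/gather array immediately behind it. A minimal sketch of that layout follows, not part of the patch, using stand-in types (sge, send_wr) and a hypothetical alloc_wr_sketch() in place of the ib_* structures.

/*
 * Editorial sketch (not part of the patch): one allocation holding a
 * work-request struct followed by its scatter/gather array, as alloc_wr()
 * and the sg_list assignment do in the hunk above.
 */
#include <stdint.h>
#include <stdlib.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

struct sge {			/* stand-in for struct ib_sge */
	uint64_t addr;
	uint32_t length;
	uint32_t lkey;
};

struct send_wr {		/* stand-in for struct ib_send_wr */
	struct send_wr *next;
	struct sge *sg_list;
	int num_sge;
};

/* WR header, padded to SGE alignment, then num_sge SGEs, in one block. */
static struct send_wr *alloc_wr_sketch(size_t wr_size, uint32_t num_sge)
{
	struct send_wr *wr;

	wr = malloc(ALIGN_UP(wr_size, sizeof(struct sge)) +
	    num_sge * sizeof(struct sge));
	if (wr == NULL)
		return NULL;
	wr->next = NULL;
	wr->num_sge = (int)num_sge;
	wr->sg_list = num_sge != 0 ?
	    (struct sge *)((char *)wr + ALIGN_UP(wr_size, sizeof(struct sge))) :
	    NULL;
	return wr;
}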
*uobj; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2666,6 +2996,7 @@ } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2713,14 +3044,15 @@ } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2749,474 +3081,529 @@ return ret ? ret : in_len; } -static int __uverbs_create_xsrq(struct ib_uverbs_file *file, - struct ib_uverbs_create_xsrq *cmd, - struct ib_udata *udata) +static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) { - struct ib_uverbs_create_srq_resp resp; - struct ib_usrq_object *obj; - struct ib_pd *pd; - struct ib_srq *srq; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_srq_init_attr attr; - int ret; + /* Returns user space filter size, includes padding */ + return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; +} - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; +static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, + u16 ib_real_filter_sz) +{ + /* + * User space filter structures must be 64 bit aligned, otherwise this + * may pass, but we won't handle additional new attributes. + */ - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); - down_write(&obj->uevent.uobject.mutex); + if (kern_filter_size > ib_real_filter_sz) { + if (memchr_inv((char *)kern_spec_filter + + ib_real_filter_sz, 0, + kern_filter_size - ib_real_filter_sz)) + return -EINVAL; + return ib_real_filter_sz; + } + return kern_filter_size; +} - if (cmd->srq_type == IB_SRQT_XRC) { - attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); - if (!attr.ext.xrc.xrcd) { - ret = -EINVAL; - goto err; - } +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ssize_t actual_filter_sz; + ssize_t kern_filter_sz; + ssize_t ib_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); - atomic_inc(&obj->uxrcd->refcnt); + if (kern_spec->reserved) + return -EINVAL; - attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); - if (!attr.ext.xrc.cq) { - ret = -EINVAL; - goto err_put_xrcd; - } + ib_spec->type = kern_spec->type; + + kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + /* User flow spec size must be aligned to 4 bytes */ + if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) + return -EINVAL; + + kern_spec_val = (char *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = (char *)kern_spec_val + kern_filter_sz; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV4: + ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, 
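spec_filter_size() above encodes the flow-spec compatibility rule: a filter larger than what this kernel understands is accepted only when its unknown tail is all zero, and is then truncated to the known size. Below is a hedged standalone rendering of that rule in plain C; any_byte_differs() stands in for the kernel's memchr_inv(), and filter_size_sketch() is an invented name.

/*
 * Editorial sketch (not part of the patch): trim-or-reject handling of an
 * oversized user-space flow filter, as in spec_filter_size() above.
 */
#include <stddef.h>

/* Return nonzero if any of the n bytes at p differs from c. */
static int any_byte_differs(const void *p, int c, size_t n)
{
	const unsigned char *b = p;
	size_t i;

	for (i = 0; i < n; i++)
		if (b[i] != (unsigned char)c)
			return 1;
	return 0;
}

/*
 * kern_filter_size is what userspace sent; ib_real_filter_sz is what this
 * kernel understands.  A larger filter is allowed only if the unknown
 * tail is all zero, and is then truncated to the known size.
 */
static long filter_size_sketch(const void *kern_spec_filter,
    size_t kern_filter_size, size_t ib_real_filter_sz)
{
	if (kern_filter_size > ib_real_filter_sz) {
		if (any_byte_differs((const char *)kern_spec_filter +
		    ib_real_filter_sz, 0,
		    kern_filter_size - ib_real_filter_sz))
			return -1;	/* -EINVAL in the kernel */
		return (long)ib_real_filter_sz;
	}
	return (long)kern_filter_size;
}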
real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV6: + ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv6); + memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); + + if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || + (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) + return -EINVAL; + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); + break; + default: + return -EINVAL; } + return 0; +} - pd = idr_read_pd(cmd->pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err_put_cq; - } +int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_wq cmd = {}; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct ib_uwq_object *obj; + int err = 0; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_wq_init_attr wq_init_attr = {}; + size_t required_cmd_sz; + size_t required_resp_len; - attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = file; - attr.srq_type = cmd->srq_type; - attr.attr.max_wr = cmd->max_wr; - attr.attr.max_sge = cmd->max_sge; - attr.attr.srq_limit = cmd->srq_limit; + required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); + required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); - obj->uevent.events_reported = 0; - INIT_LIST_HEAD(&obj->uevent.event_list); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; - srq = pd->device->create_srq(pd, &attr, udata); - if (IS_ERR(srq)) { - ret = PTR_ERR(srq); - goto err_put; - } + if (ucore->outlen < required_resp_len) + return -ENOSPC; - srq->device = pd->device; - srq->pd = pd; - srq->srq_type = cmd->srq_type; - srq->uobject = &obj->uevent.uobject; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; - srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); - atomic_inc(&attr.ext.xrc.xrcd->usecnt); - } + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); + if (cmd.comp_mask) + return -EOPNOTSUPP; - obj->uevent.uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; - memset(&resp, 0, sizeof resp); - 
resp.srq_handle = obj->uevent.uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; - if (cmd->srq_type == IB_SRQT_XRC) - resp.srqn = srq->ext.xrc.srq_num; + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, + &wq_lock_class); + down_write(&obj->uevent.uobject.mutex); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + err = -EINVAL; + goto err_uobj; + } - if (copy_to_user((void __user *) (unsigned long) cmd->response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_copy; + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + err = -EINVAL; + goto err_put_pd; } - if (cmd->srq_type == IB_SRQT_XRC) { - put_uobj_read(xrcd_uobj); - put_cq_read(attr.ext.xrc.cq); + wq_init_attr.cq = cq; + wq_init_attr.max_sge = cmd.max_sge; + wq_init_attr.max_wr = cmd.max_wr; + wq_init_attr.wq_context = file; + wq_init_attr.wq_type = cmd.wq_type; + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq = pd->device->create_wq(pd, &wq_init_attr, uhw); + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_cq; } + + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); + if (err) + goto destroy_wq; + + memset(&resp, 0, sizeof(resp)); + resp.wq_handle = obj->uevent.uobject.id; + resp.max_sge = wq_init_attr.max_sge; + resp.max_wr = wq_init_attr.max_wr; + resp.wqn = wq->wq_num; + resp.response_length = required_resp_len; + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + put_pd_read(pd); + put_cq_read(cq); mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list); mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; - up_write(&obj->uevent.uobject.mutex); - return 0; err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - -err_destroy: - ib_destroy_srq(srq); - -err_put: - put_pd_read(pd); - + idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); +destroy_wq: + ib_destroy_wq(wq); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - put_cq_read(attr.ext.xrc.cq); - -err_put_xrcd: - if (cmd->srq_type == IB_SRQT_XRC) { - atomic_dec(&obj->uxrcd->refcnt); - put_uobj_read(xrcd_uobj); - } - -err: + put_cq_read(cq); +err_put_pd: + put_pd_read(pd); +err_uobj: put_uobj_write(&obj->uevent.uobject); - return ret; + + return err; } -ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_create_srq cmd; - struct ib_uverbs_create_xsrq xcmd; - struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - int ret; + struct ib_uverbs_ex_destroy_wq cmd = {}; + struct ib_uverbs_ex_destroy_wq_resp resp = {}; + struct ib_wq *wq; + struct ib_uobject *uobj; + struct ib_uwq_object *obj; + size_t required_cmd_sz; + size_t required_resp_len; + int ret; - if (out_len < sizeof resp) - return -ENOSPC; + required_cmd_sz = 
offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle); + required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + if (ucore->inlen < required_cmd_sz) + return -EINVAL; - xcmd.response = cmd.response; - xcmd.user_handle = cmd.user_handle; - xcmd.srq_type = IB_SRQT_BASIC; - xcmd.pd_handle = cmd.pd_handle; - xcmd.max_wr = cmd.max_wr; - xcmd.max_sge = cmd.max_sge; - xcmd.srq_limit = cmd.srq_limit; + if (ucore->outlen < required_resp_len) + return -ENOSPC; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - ret = __uverbs_create_xsrq(file, &xcmd, &udata); + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; - return in_len; -} - -ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, int out_len) -{ - struct ib_uverbs_create_xsrq cmd; - struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; - int ret; - - if (out_len < sizeof resp) - return -ENOSPC; + if (cmd.comp_mask) + return -EOPNOTSUPP; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + resp.response_length = required_resp_len; + uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, + file->ucontext); + if (!uobj) + return -EINVAL; - INIT_UDATA(&udata, buf + sizeof cmd, - (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + wq = uobj->object; + obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); + ret = ib_destroy_wq(wq); + if (!ret) + uobj->live = 0; - ret = __uverbs_create_xsrq(file, &cmd, &udata); + put_uobj_write(uobj); if (ret) return ret; - return in_len; -} + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); -ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_modify_srq cmd; - struct ib_udata udata; - struct ib_srq *srq; - struct ib_srq_attr attr; - int ret; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, - out_len); - - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) - return -EINVAL; - - attr.max_wr = cmd.max_wr; - attr.srq_limit = cmd.srq_limit; + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); - ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); + ib_uverbs_release_uevent(file, &obj->uevent); + resp.events_reported = obj->uevent.events_reported; + put_uobj(uobj); - put_srq_read(srq); + ret = ib_copy_to_udata(ucore, &resp, resp.response_length); + if (ret) + return ret; - return ret ? 
ret : in_len; + return 0; } -ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, - const char __user *buf, - int in_len, int out_len) +int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_query_srq cmd; - struct ib_uverbs_query_srq_resp resp; - struct ib_srq_attr attr; - struct ib_srq *srq; - int ret; - - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + struct ib_uverbs_ex_modify_wq cmd = {}; + struct ib_wq *wq; + struct ib_wq_attr wq_attr = {}; + size_t required_cmd_sz; + int ret; - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) + required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state); + if (ucore->inlen < required_cmd_sz) return -EINVAL; - ret = ib_query_srq(srq, &attr); - - put_srq_read(srq); + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) - return ret; + return ret; - memset(&resp, 0, sizeof resp); + if (!cmd.attr_mask) + return -EINVAL; - resp.max_wr = attr.max_wr; - resp.max_sge = attr.max_sge; - resp.srq_limit = attr.srq_limit; + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) + return -EINVAL; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - return -EFAULT; + wq = idr_read_wq(cmd.wq_handle, file->ucontext); + if (!wq) + return -EINVAL; - return in_len; + wq_attr.curr_wq_state = cmd.curr_wq_state; + wq_attr.wq_state = cmd.wq_state; + ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); + put_wq_read(wq); + return ret; } -ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_destroy_srq cmd; - struct ib_uverbs_destroy_srq_resp resp; - struct ib_uobject *uobj; - struct ib_srq *srq; - struct ib_uevent_object *obj; - int ret = -EINVAL; - struct ib_usrq_object *us; - enum ib_srq_type srq_type; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; - - uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); - if (!uobj) + struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err = 0; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + size_t required_cmd_sz_header; + size_t required_resp_len; + + required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); + required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); + + if (ucore->inlen < required_cmd_sz_header) return -EINVAL; - srq = uobj->object; - obj = container_of(uobj, struct ib_uevent_object, uobject); - srq_type = srq->srq_type; - - ret = ib_destroy_srq(srq); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - - if (ret) - return ret; - - if (srq_type == IB_SRQT_XRC) { - us = container_of(obj, struct ib_usrq_object, uevent); - atomic_dec(&us->uxrcd->refcnt); - } - - idr_remove_uobj(&ib_uverbs_srq_idr, 
uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - ib_uverbs_release_uevent(file, obj); + if (ucore->outlen < required_resp_len) + return -ENOSPC; - memset(&resp, 0, sizeof resp); - resp.events_reported = obj->events_reported; + err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); + if (err) + return err; - put_uobj(uobj); + ucore->inbuf = (const char *)ucore->inbuf + required_cmd_sz_header; + ucore->inlen -= required_cmd_sz_header; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - ret = -EFAULT; + if (cmd.comp_mask) + return -EOPNOTSUPP; - return ret ? ret : in_len; -} + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; -ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) -{ - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_dct cmd; - struct ib_uverbs_create_dct_resp resp; - struct ib_udata udata; - struct ib_udct_object *obj; - struct ib_dct *dct; - int ret; - struct ib_dct_init_attr attr; - struct ib_pd *pd = NULL; - struct ib_cq *cq = NULL; - struct ib_srq *srq = NULL; + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); - if (out_len < sizeof(resp)) - return -ENOSPC; + if (ucore->inlen < expected_in_size) + return -EINVAL; - ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); - if (ret) - return ret; + if (ucore->inlen > expected_in_size && + !ib_is_udata_cleared(ucore, expected_in_size, + ucore->inlen - expected_in_size)) + return -EOPNOTSUPP; - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, - &dct_lock_class); - down_write(&obj->uobject.mutex); + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; - pd = idr_read_pd(cmd.pd_handle, file->ucontext); - if (!pd) { - ret = -EINVAL; - goto err_pd; + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; } - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) { - ret = -EINVAL; - goto err_put; - } + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } - srq = idr_read_srq(cmd.srq_handle, file->ucontext); - if (!srq) { - ret = -EINVAL; - goto err_put; + wqs[num_read_wqs] = wq; } - attr.cq = cq; - attr.access_flags = cmd.access_flags; - attr.min_rnr_timer = cmd.min_rnr_timer; - attr.srq = srq; - attr.tclass = cmd.tclass; - attr.flow_label = cmd.flow_label; - attr.dc_key = cmd.dc_key; - attr.mtu = cmd.mtu; - attr.port = cmd.port; - attr.pkey_index = cmd.pkey_index; - attr.gid_index = cmd.gid_index; - attr.hop_limit = cmd.hop_limit; - attr.create_flags = cmd.create_flags; - - dct = ib_create_dct(pd, &attr, &udata); - if (IS_ERR(dct)) { - ret = PTR_ERR(dct); - goto err_put; + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto put_wqs; } - dct->device = file->device->ib_dev; - dct->uobject = &obj->uobject; + init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); + 
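The indirection-table path above sizes the expected WQ-handle array as (1 << log_ind_tbl_size) u32 handles, plus one extra u32 of padding when there is a single handle so the user buffer stays 64-bit aligned. A small sketch of that arithmetic, assuming nothing beyond what the hunk shows; expected_handle_bytes() is an invented helper name.

/*
 * Editorial sketch (not part of the patch): expected input size for the
 * WQ handle array in the create_rwq_ind_table hunk above.
 */
#include <stddef.h>
#include <stdint.h>

static size_t expected_handle_bytes(unsigned int log_ind_tbl_size)
{
	size_t num_wq_handles = (size_t)1 << log_ind_tbl_size;
	size_t expected = num_wq_handles * sizeof(uint32_t);

	/*
	 * A single 4-byte handle gets one more u32 of padding so the
	 * user-space layout stays 8-byte aligned, matching the
	 * "input size for wq handles is u64 aligned" comment above.
	 */
	if (num_wq_handles == 1)
		expected += sizeof(uint32_t);
	return expected;
}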
down_write(&uobj->mutex); + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); - obj->uobject.object = dct; - ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject); - if (ret) - goto err_dct; + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto err_uobj; + } - memset(&resp, 0, sizeof(resp)); - resp.dct_handle = obj->uobject.id; - resp.dctn = dct->dct_num; + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); - ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - if (ret) - goto err_copy; + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); - mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->dct_list); - mutex_unlock(&file->mutex); + err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + if (err) + goto destroy_ind_tbl; - obj->uobject.live = 1; + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = required_resp_len; - put_srq_read(srq); - put_cq_read(cq); - put_pd_read(pd); + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; - up_write(&obj->uobject.mutex); + kfree(wqs_handles); - return in_len; + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); -err_copy: - idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject); + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); + mutex_unlock(&file->mutex); -err_dct: - ib_destroy_dct(dct); + uobj->live = 1; -err_put: - if (srq) - put_srq_read(srq); + up_write(&uobj->mutex); + return 0; - if (cq) - put_cq_read(cq); +err_copy: + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); +destroy_ind_tbl: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_uobj: + put_uobj_write(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} - put_pd_read(pd); +int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_uobject *uobj; + int ret; + struct ib_wq **ind_tbl; + size_t required_cmd_sz; -err_pd: - put_uobj_write(&obj->uobject); - return ret; -} + required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); -ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) -{ - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_destroy_dct cmd; - struct ib_uverbs_destroy_dct_resp resp; - struct ib_uobject *uobj; - struct ib_dct *dct; - struct ib_udct_object *obj; - int ret; + if (ucore->inlen < required_cmd_sz) + return -EINVAL; - if (out_len < sizeof(resp)) - return -ENOSPC; + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; - ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); if (ret) return ret; - uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext); + if (cmd.comp_mask) + return -EOPNOTSUPP; + + uobj = 
idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, + file->ucontext); if (!uobj) return -EINVAL; + rwq_ind_tbl = uobj->object; + ind_tbl = rwq_ind_tbl->ind_tbl; - dct = uobj->object; - obj = container_of(dct->uobject, struct ib_udct_object, uobject); - - ret = ib_destroy_dct(dct); + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); if (!ret) uobj->live = 0; @@ -3225,138 +3612,25 @@ if (ret) return ret; - idr_remove_uobj(&ib_uverbs_dct_idr, uobj); + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); - memset(&resp, 0, sizeof(resp)); - put_uobj(uobj); - - ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - if (ret) - return ret; - - return in_len; -} - -ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) -{ - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_query_dct cmd; - struct ib_uverbs_query_dct_resp resp; - struct ib_dct *dct; - struct ib_dct_attr *attr; - int err; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); - if (err) - return err; - - attr = kmalloc(sizeof(*attr), GFP_KERNEL); - if (!attr) { - err = -ENOMEM; - goto out; - } - - dct = idr_read_dct(cmd.dct_handle, file->ucontext); - if (!dct) { - err = -EINVAL; - goto out; - } - - err = ib_query_dct(dct, attr); - - put_dct_read(dct); - - if (err) - goto out; - - memset(&resp, 0, sizeof(resp)); - - resp.dc_key = attr->dc_key; - resp.access_flags = attr->access_flags; - resp.flow_label = attr->flow_label; - resp.key_violations = attr->key_violations; - resp.port = attr->port; - resp.min_rnr_timer = attr->min_rnr_timer; - resp.tclass = attr->tclass; - resp.mtu = attr->mtu; - resp.pkey_index = attr->pkey_index; - resp.gid_index = attr->gid_index; - resp.hop_limit = attr->hop_limit; - resp.state = attr->state; - - err = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); - -out: - kfree(attr); - - return err ? 
err : in_len; -} - -/* - * Experimental functions - */ - -static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; - -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) -{ - ib_spec->type = kern_spec->type; - - switch (ib_spec->type) { - case IB_FLOW_SPEC_ETH: - ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); - memcpy(&ib_spec->eth.val, &kern_spec->eth.val, - sizeof(struct ib_flow_eth_filter)); - memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, - sizeof(struct ib_flow_eth_filter)); - break; - case IB_FLOW_SPEC_IB: - ib_spec->ib.size = sizeof(struct ib_flow_spec_ib); - memcpy(&ib_spec->ib.val, &kern_spec->ib.val, - sizeof(struct ib_flow_ib_filter)); - memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask, - sizeof(struct ib_flow_ib_filter)); - break; - case IB_FLOW_SPEC_IPV4: - ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); - memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, - sizeof(struct ib_flow_ipv4_filter)); - memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, - sizeof(struct ib_flow_ipv4_filter)); - break; - case IB_FLOW_SPEC_TCP: - case IB_FLOW_SPEC_UDP: - ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); - memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, - sizeof(struct ib_flow_tcp_udp_filter)); - memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, - sizeof(struct ib_flow_tcp_udp_filter)); - break; - default: - return -EINVAL; - } - return 0; + kfree(ind_tbl); + return ret; } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3366,6 +3640,9 @@ void *ib_spec; int i; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + if (ucore->outlen < sizeof(resp)) return -ENOSPC; @@ -3373,15 +3650,23 @@ if (err) return err; - ucore->inbuf += sizeof(cmd); + ucore->inbuf = (const char *)ucore->inbuf + sizeof(cmd); ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; - if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) + if (priv_check(curthread, PRIV_NET_RAW) != 0) return -EPERM; + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; @@ -3390,11 +3675,15 @@ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + - cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) - return -ENOMEM; + return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); err = ib_copy_from_udata(kern_flow_attr + 1, ucore, @@ -3419,8 +3708,8 @@ goto err_uobj; } - flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, - GFP_KERNEL); + flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs * + sizeof(union ib_flow_spec), GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ 
-3436,25 +3725,24 @@ kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > - offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && cmd.flow_attr.size >= ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += - ((union ib_flow_spec *)ib_spec)->size; - cmd.flow_attr.size -= - ((struct ib_uverbs_flow_spec *)kern_spec)->size; - kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size; - ib_spec += ((union ib_flow_spec *)ib_spec)->size; + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec = (char *)kern_spec + ((struct ib_uverbs_flow_spec *) kern_spec)->size; + ib_spec = (char *)ib_spec + ((union ib_flow_spec *)ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", i, cmd.flow_attr.size); + err = -EINVAL; goto err_free; - } + } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); @@ -3505,6 +3793,7 @@ } int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3513,10 +3802,16 @@ struct ib_uobject *uobj; int ret; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (ret) return ret; + if (cmd.comp_mask) + return -EINVAL; + uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); if (!uobj) @@ -3540,375 +3835,417 @@ return ret; } -ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) +{ + struct ib_uverbs_create_srq_resp resp; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_srq_init_attr attr; + int ret; + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); + down_write(&obj->uevent.uobject.mutex); + + if (cmd->srq_type == IB_SRQT_XRC) { + attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + + attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + if (!attr.ext.xrc.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = idr_read_pd(cmd->pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_put_cq; + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + srq = pd->device->create_srq(pd, &attr, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; + srq->event_handler = 
attr.event_handler; + srq->srq_context = attr.srq_context; + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.cq = attr.ext.xrc.cq; + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.cq->usecnt); + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + + obj->uevent.uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (copy_to_user((void __user *) (unsigned long) cmd->response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (cmd->srq_type == IB_SRQT_XRC) { + put_uobj_read(xrcd_uobj); + put_cq_read(attr.ext.xrc.cq); + } + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); + mutex_unlock(&file->mutex); + + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); + + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); + +err_destroy: + ib_destroy_srq(srq); + +err_put: + put_pd_read(pd); + +err_put_cq: + if (cmd->srq_type == IB_SRQT_XRC) + put_cq_read(attr.ext.xrc.cq); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + put_uobj_read(xrcd_uobj); + } + +err: + put_uobj_write(&obj->uevent.uobject); + return ret; +} + +ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); + + ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, int out_len) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_EXTENDED); -} + if (out_len < sizeof resp) + return -ENOSPC; + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; -ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) -{ - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_cq_ex cmd; + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); - if (copy_from_user(&cmd, buf, 
sizeof(cmd))) - return -EFAULT; + ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); + if (ret) + return ret; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf); + return in_len; } -ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - struct ib_uverbs_modify_cq_ex cmd; - struct ib_cq *cq; - struct ib_cq_attr attr; - int ret; + struct ib_uverbs_modify_srq cmd; + struct ib_udata udata; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) + INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, + out_len); + + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) return -EINVAL; - attr.moderation.cq_count = cmd.cq_count; - attr.moderation.cq_period = cmd.cq_period; - attr.cq_cap_flags = cmd.cq_cap_flags; + attr.max_wr = cmd.max_wr; + attr.srq_limit = cmd.srq_limit; - ret = ib_modify_cq(cq, &attr, cmd.attr_mask); + ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); - put_cq_read(cq); + put_srq_read(srq); return ret ? ret : in_len; } - -ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) { - struct ib_uverbs_exp_query_device_resp resp; - struct ib_exp_device_attr exp_attr; - int ret; + struct ib_uverbs_query_srq cmd; + struct ib_uverbs_query_srq_resp resp; + struct ib_srq_attr attr; + struct ib_srq *srq; + int ret; - if (ucore->outlen + uhw->outlen < sizeof(resp)) + if (out_len < sizeof resp) return -ENOSPC; - memset(&resp, 0, sizeof(resp)); - memset(&exp_attr, 0, sizeof(exp_attr)); - ret = ib_exp_query_device(file->device->ib_dev, &exp_attr); - if (ret) - return ret; + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; - ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file); + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; - resp.comp_mask = 0; - resp.device_cap_flags2 = 0; + ret = ib_query_srq(srq, &attr); - /* - * Handle regular attr fields - */ - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { - resp.timestamp_mask = exp_attr.base.timestamp_mask; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; - } + put_srq_read(srq); - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { - resp.hca_core_clock = exp_attr.base.hca_core_clock; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; - } + if (ret) + return ret; - /* - * Handle experimental attr fields - */ - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) { - resp.device_cap_flags2 = exp_attr.device_cap_flags2; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; - } + memset(&resp, 0, sizeof resp); - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { - resp.dc_rd_req = exp_attr.dc_rd_req; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; - } + resp.max_wr = attr.max_wr; + resp.max_sge = attr.max_sge; + resp.srq_limit = attr.srq_limit; - if (exp_attr.exp_comp_mask & 
IB_EXP_DEVICE_ATTR_DC_RES_RD) { - resp.dc_rd_res = exp_attr.dc_rd_res; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; - } + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { - resp.inline_recv_sz = exp_attr.inline_recv_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; - } + return in_len; +} - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { - resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; - } +ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_srq cmd; + struct ib_uverbs_destroy_srq_resp resp; + struct ib_uobject *uobj; + struct ib_srq *srq; + struct ib_uevent_object *obj; + int ret = -EINVAL; + struct ib_usrq_object *us; + enum ib_srq_type srq_type; - if (copy_to_user(ucore->outbuf, &resp, sizeof(resp))) + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - return ucore->inlen + uhw->inlen; -} + uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + srq = uobj->object; + obj = container_of(uobj, struct ib_uevent_object, uobject); + srq_type = srq->srq_type; -ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) -{ - struct ib_uqp_object *obj; - struct ib_device *device; - struct ib_pd *pd = NULL; - struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); - struct ib_cq *scq = NULL, *rcq = NULL; - struct ib_srq *srq = NULL; - struct ib_qp *qp; - struct ib_exp_qp_init_attr attr; - int ret; - struct ib_uverbs_exp_create_qp cmd_exp; - struct ib_uverbs_exp_create_qp_resp resp_exp; - struct ib_qp *parentqp = NULL; + ret = ib_destroy_srq(srq); + if (!ret) + uobj->live = 0; - memset(&cmd_exp, 0, sizeof(cmd_exp)); + put_uobj_write(uobj); - ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); if (ret) return ret; - if (!disable_raw_qp_enforcement && - cmd_exp.qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, - PRIV_NET_RAW)) - return -EPERM; + if (srq_type == IB_SRQT_XRC) { + us = container_of(obj, struct ib_usrq_object, uevent); + atomic_dec(&us->uxrcd->refcnt); + } - obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext, - &qp_lock_class); - down_write(&obj->uevent.uobject.mutex); + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj); - if (!xrcd) { - ret = -EINVAL; - goto err_put; - } - device = xrcd->device; - } else { - if (cmd_exp.qp_type == IB_QPT_XRC_INI) { - cmd_exp.max_recv_wr = 0; - cmd_exp.max_recv_sge = 0; - } else { - if (cmd_exp.is_srq) { - srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext); - if (!srq || srq->srq_type != IB_SRQT_BASIC) { - ret = -EINVAL; - goto err_put; - } - } + ib_uverbs_release_uevent(file, obj); - if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) { - rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0); - if (!rcq) { - ret = -EINVAL; - goto err_put; - } - } - } + memset(&resp, 0, sizeof resp); + resp.events_reported = obj->events_reported; - scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq); - rcq = 
rcq ?: scq; - pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext); - if (!pd || !scq) { - ret = -EINVAL; - goto err_put; - } + put_uobj(uobj); - device = pd->device; - } + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; - memset(&attr, 0, sizeof(attr)); - attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = file; - attr.send_cq = scq; - attr.recv_cq = rcq; - attr.srq = srq; - attr.xrcd = xrcd; - attr.sq_sig_type = cmd_exp.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd_exp.qp_type; - attr.create_flags = 0; + return ret ? ret : in_len; +} - attr.cap.max_send_wr = cmd_exp.max_send_wr; - attr.cap.max_recv_wr = cmd_exp.max_recv_wr; - attr.cap.max_send_sge = cmd_exp.max_send_sge; - attr.cap.max_recv_sge = cmd_exp.max_recv_sge; - attr.cap.max_inline_data = cmd_exp.max_inline_data; - - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) - attr.create_flags |= cmd_exp.qp_cap_flags & - (IB_QP_CREATE_CROSS_CHANNEL | - IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV); - - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { - struct ib_uverbs_qpg *qpg; - if (cmd_exp.qp_type != IB_QPT_RAW_PACKET && - cmd_exp.qp_type != IB_QPT_UD) { - ret = -EINVAL; - goto err_put; - } - qpg = &cmd_exp.qpg; - switch (qpg->qpg_type) { - case IB_QPG_PARENT: - attr.parent_attrib.rss_child_count = - qpg->parent_attrib.rss_child_count; - attr.parent_attrib.tss_child_count = - qpg->parent_attrib.tss_child_count; - break; - case IB_QPG_CHILD_RX: - case IB_QPG_CHILD_TX: - parentqp = idr_read_qp(qpg->parent_handle, - file->ucontext); - if (!parentqp) { - ret = -EINVAL; - goto err_put; - } - attr.qpg_parent = parentqp; - break; - default: - ret = -EINVAL; - goto err_put; - } - attr.qpg_type = qpg->qpg_type; - } +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp = { {0} }; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr = {0}; + int err; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) - attr.max_inl_recv = cmd_exp.max_inl_recv; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; - obj->uevent.events_reported = 0; - INIT_LIST_HEAD(&obj->uevent.event_list); - INIT_LIST_HEAD(&obj->mcast_list); + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr); - else - qp = device->exp_create_qp(pd, &attr, uhw); + if (cmd.comp_mask) + return -EINVAL; - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); - goto err_put; - } + if (cmd.reserved) + return -EINVAL; - if (cmd_exp.qp_type != IB_QPT_XRC_TGT) { - qp->real_qp = qp; - qp->device = device; - qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; - atomic_set(&qp->usecnt, 0); - atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); - } - qp->uobject = &obj->uevent.uobject; + resp.response_length = offsetof(typeof(resp), odp_caps); - obj->uevent.uobject.object = qp; - ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + if (ucore->outlen < resp.response_length) + return -ENOSPC; - memset(&resp_exp, 0, 
sizeof(resp_exp)); - resp_exp.qpn = qp->qp_num; - resp_exp.qp_handle = obj->uevent.uobject.id; - resp_exp.max_recv_sge = attr.cap.max_recv_sge; - resp_exp.max_send_sge = attr.cap.max_send_sge; - resp_exp.max_recv_wr = attr.cap.max_recv_wr; - resp_exp.max_send_wr = attr.cap.max_send_wr; - resp_exp.max_inline_data = attr.cap.max_inline_data; + err = ib_dev->query_device(ib_dev, &attr, uhw); + if (err) + return err; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { - resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV; - resp_exp.max_inl_recv = attr.max_inl_recv; - } + copy_query_dev_fields(file, ib_dev, &resp.base, &attr); - ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); - if (ret) - goto err_copy; + if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) + goto end; - if (xrcd) { - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); - atomic_inc(&obj->uxrcd->refcnt); - put_xrcd_read(xrcd_uobj); - } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; +#endif + resp.response_length += sizeof(resp.odp_caps); - if (pd) - put_pd_read(pd); - if (scq) - put_cq_read(scq); - if (rcq && rcq != scq) - put_cq_read(rcq); - if (srq) - put_srq_read(srq); - if (parentqp) - put_qp_read(parentqp); + if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask)) + goto end; - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); - mutex_unlock(&file->mutex); + resp.timestamp_mask = attr.timestamp_mask; + resp.response_length += sizeof(resp.timestamp_mask); - obj->uevent.uobject.live = 1; + if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock)) + goto end; - up_write(&obj->uevent.uobject.mutex); + resp.hca_core_clock = attr.hca_core_clock; + resp.response_length += sizeof(resp.hca_core_clock); - return ucore->inlen + uhw->inlen; + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; -err_copy: - idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); -err_destroy: - ib_destroy_qp(qp); + if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps)) + goto end; -err_put: - if (xrcd) - put_xrcd_read(xrcd_uobj); - if (pd) - put_pd_read(pd); - if (scq) - put_cq_read(scq); - if (rcq && rcq != scq) - put_cq_read(rcq); - if (srq) - put_srq_read(srq); - if (parentqp) - put_qp_read(parentqp); + resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; + resp.rss_caps.max_rwq_indirection_tables = + attr.rss_caps.max_rwq_indirection_tables; + resp.rss_caps.max_rwq_indirection_table_size = + attr.rss_caps.max_rwq_indirection_table_size; - put_uobj_write(&obj->uevent.uobject); - return ret; -} + resp.response_length += sizeof(resp.rss_caps); -int ib_exp_query_device(struct ib_device *device, - struct ib_exp_device_attr *device_attr) -{ - return device->exp_query_device(device, device_attr); + if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq)) + goto end; + + resp.max_wq_type_rq = attr.max_wq_type_rq; + resp.response_length += sizeof(resp.max_wq_type_rq); +end: + err = 
ib_copy_to_udata(ucore, &resp, resp.response_length); + return err; } -EXPORT_SYMBOL(ib_exp_query_device); Index: sys/ofed/drivers/infiniband/core/ib_uverbs_main.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_uverbs_main.c +++ sys/ofed/drivers/infiniband/core/ib_uverbs_main.c @@ -43,12 +43,11 @@ #include #include #include -#include -#include -#include #include +#include + #include "uverbs.h" MODULE_AUTHOR("Roland Dreier"); @@ -63,31 +62,6 @@ #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) -static int uverbs_copy_from_udata_ex(void *dest, struct ib_udata *udata, size_t len) -{ - return copy_from_user(dest, udata->inbuf, min(udata->inlen, len)) ? -EFAULT : 0; -} - -static int uverbs_copy_to_udata_ex(struct ib_udata *udata, void *src, size_t len) -{ - return copy_to_user(udata->outbuf, src, min(udata->outlen, len)) ? -EFAULT : 0; -} - -static struct ib_udata_ops uverbs_copy_ex = { - .copy_from = uverbs_copy_from_udata_ex, - .copy_to = uverbs_copy_to_udata_ex -}; - -#define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->ops = &uverbs_copy_ex; \ - (udata)->inbuf = (void __user *)(unsigned long)(ibuf); \ - (udata)->outbuf = (void __user *)(unsigned long)(obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - - static struct class *uverbs_class; DEFINE_SPINLOCK(ib_uverbs_idr_lock); @@ -100,44 +74,47 @@ DEFINE_IDR(ib_uverbs_srq_idr); DEFINE_IDR(ib_uverbs_xrcd_idr); DEFINE_IDR(ib_uverbs_rule_idr); -DEFINE_IDR(ib_uverbs_dct_idr); +DEFINE_IDR(ib_uverbs_wq_idr); +DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) = { - [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, - [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, - [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, - [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, - [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, - [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, - [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, - [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, - [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, - [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, - [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, - [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, - [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, - [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, - [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, - [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, - [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, - [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, - [IB_USER_VERBS_CMD_POST_SRQ_RECV] = 
ib_uverbs_post_srq_recv, - [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, - [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, - [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, - [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, - [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, - [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, - [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, - [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, @@ -145,36 +122,48 @@ }; static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, -}; - -static ssize_t (*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) = { - [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp, - [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq, - [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp, - [IB_USER_VERBS_EXP_CMD_CREATE_CQ] = ib_uverbs_exp_create_cq, - [IB_USER_VERBS_EXP_CMD_QUERY_DEVICE] = ib_uverbs_exp_query_device, - [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct, - [IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct, - [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, + [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq, + [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp, + [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq, + [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq, + [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq, + [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table, }; static void ib_uverbs_add_one(struct ib_device *device); -static void ib_uverbs_remove_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); -static void ib_uverbs_release_dev(struct kref *ref) +int uverbs_dealloc_mw(struct 
ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + +static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = - container_of(ref, struct ib_uverbs_device, ref); + container_of(kobj, struct ib_uverbs_device, kobj); - complete(&dev->comp); + cleanup_srcu_struct(&dev->disassociate_srcu); + kfree(dev); } +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + static void ib_uverbs_release_event_file(struct kref *ref) { struct ib_uverbs_event_file *file = @@ -237,10 +226,6 @@ struct ib_ucontext *context) { struct ib_uobject *uobj, *tmp; - int err; - - if (!context) - return 0; context->closing = 1; @@ -257,13 +242,10 @@ struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - err = ib_dealloc_mw(mw); - if (err) { - pr_info("user_verbs: couldn't deallocate MW during cleanup.\n"); - pr_info("user_verbs: the system may have become unstable.\n"); - } + uverbs_dealloc_mw(mw); kfree(uobj); } + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { struct ib_flow *flow_id = uobj->object; @@ -278,28 +260,32 @@ container_of(uobj, struct ib_uqp_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - - ib_uverbs_detach_umcast(qp, uqp); - err = ib_destroy_qp(qp); - if (err) - pr_info("destroying uverbs qp failed: err %d\n", err); - + if (qp == qp->real_qp) + ib_uverbs_detach_umcast(qp, uqp); + ib_destroy_qp(qp); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } - list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) { - struct ib_dct *dct = uobj->object; - struct ib_udct_object *udct = - container_of(uobj, struct ib_udct_object, uobject); + list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) { + struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - idr_remove_uobj(&ib_uverbs_dct_idr, uobj); + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + ib_destroy_rwq_ind_table(rwq_ind_tbl); + kfree(ind_tbl); + kfree(uobj); + } - err = ib_destroy_dct(dct); - if (err) - pr_info("destroying uverbs dct failed: err %d\n", err); + list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { + struct ib_wq *wq = uobj->object; + struct ib_uwq_object *uwq = + container_of(uobj, struct ib_uwq_object, uevent.uobject); - kfree(udct); + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + ib_destroy_wq(wq); + ib_uverbs_release_uevent(file, &uwq->uevent); + kfree(uwq); } list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { @@ -308,9 +294,7 @@ container_of(uobj, struct ib_uevent_object, uobject); idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - err = ib_destroy_srq(srq); - if (err) - pr_info("destroying uverbs srq failed: err %d\n", err); + ib_destroy_srq(srq); ib_uverbs_release_uevent(file, uevent); kfree(uevent); } @@ -322,10 +306,7 @@ container_of(uobj, struct ib_ucq_object, uobject); idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - err = ib_destroy_cq(cq); - if (err) - pr_info("destroying uverbs cq failed: err %d\n", err); - + ib_destroy_cq(cq); ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } @@ -334,11 +315,7 @@ struct ib_mr *mr = uobj->object; idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - err = ib_dereg_mr(mr); - if (err) { - pr_info("user_verbs: couldn't deregister an MR during cleanup.\n"); - pr_info("user_verbs: the system may have become unstable.\n"); - } + ib_dereg_mr(mr); kfree(uobj); } @@ -362,16 +339,32 @@ kfree(uobj); } + 
put_pid(context->tgid); + return context->device->dealloc_ucontext(context); } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); + struct ib_device *ib_dev; + int srcu_key; - module_put(file->device->ib_dev->owner); - kref_put(&file->device->ref, ib_uverbs_release_dev); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (ib_dev && !ib_dev->disassociate_ucontext) + module_put(ib_dev->owner); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); kfree(file); } @@ -393,9 +386,19 @@ return -EAGAIN; if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->event_list))) + (!list_empty(&file->event_list) || + /* The barriers built into wait_event_interruptible() + * and wake_up() guarentee this will see the null set + * without using RCU + */ + !file->uverbs_file->device->ib_dev))) return -ERESTARTSYS; + /* If device was disassociated and no event exists set an error */ + if (list_empty(&file->event_list) && + !file->uverbs_file->device->ib_dev) + return -EIO; + spin_lock_irq(&file->lock); } @@ -437,7 +440,6 @@ unsigned int pollflags = 0; struct ib_uverbs_event_file *file = filp->private_data; - file->filp = filp; poll_wait(filp, &file->poll_wait, wait); spin_lock_irq(&file->lock); @@ -459,8 +461,11 @@ { struct ib_uverbs_event_file *file = filp->private_data; struct ib_uverbs_event *entry, *tmp; + int closed_already = 0; + mutex_lock(&file->uverbs_file->device->lists_mutex); spin_lock_irq(&file->lock); + closed_already = file->is_closed; file->is_closed = 1; list_for_each_entry_safe(entry, tmp, &file->event_list, list) { if (entry->counter) @@ -468,11 +473,15 @@ kfree(entry); } spin_unlock_irq(&file->lock); - - if (file->is_async) { - ib_unregister_event_handler(&file->uverbs_file->event_handler); - kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + if (!closed_already) { + list_del(&file->list); + if (file->is_async) + ib_unregister_event_handler(&file->uverbs_file-> + event_handler); } + mutex_unlock(&file->uverbs_file->device->lists_mutex); + + kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&file->ref, ib_uverbs_release_event_file); return 0; @@ -480,7 +489,7 @@ static const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, - .read = ib_uverbs_event_read, + .read = ib_uverbs_event_read, .poll = ib_uverbs_event_poll, .release = ib_uverbs_event_close, .fasync = ib_uverbs_event_fasync, @@ -519,8 +528,6 @@ spin_unlock_irqrestore(&file->lock, flags); wake_up_interruptible(&file->poll_wait); - if (file->filp) - selwakeup(&file->filp->f_selinfo); kill_fasync(&file->async_queue, SIGIO, POLL_IN); } @@ -546,6 +553,7 @@ entry->desc.async.element = element; entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; entry->counter = counter; list_add_tail(&entry->list, &file->async_file->event_list); @@ -554,8 +562,6 @@ spin_unlock_irqrestore(&file->async_file->lock, flags); wake_up_interruptible(&file->async_file->poll_wait); - if (file->async_file->filp) - selwakeup(&file->async_file->filp->f_selinfo); kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN); } @@ -573,6 +579,10 @@ { struct ib_uevent_object *uobj; + /* for XRC target qp's, check that qp is 
live */ + if (!event->element.qp->uobject || !event->element.qp->uobject->live) + return; + uobj = container_of(event->element.qp->uobject, struct ib_uevent_object, uobject); @@ -581,6 +591,16 @@ &uobj->events_reported); } +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj = container_of(event->element.wq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; @@ -603,13 +623,21 @@ NULL, NULL); } +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) +{ + kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + file->async_file = NULL; +} + struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async) { struct ib_uverbs_event_file *ev_file; struct file *filp; + int ret; - ev_file = kzalloc(sizeof *ev_file, GFP_KERNEL); + ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) return ERR_PTR(-ENOMEM); @@ -618,22 +646,50 @@ INIT_LIST_HEAD(&ev_file->event_list); init_waitqueue_head(&ev_file->poll_wait); ev_file->uverbs_file = uverbs_file; - ev_file->is_async = is_async; - - /* - * fops_get() can't fail here, because we're coming from a - * system call on a uverbs file, which will already have a - * module reference. - */ - filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops)); + kref_get(&ev_file->uverbs_file->ref); + ev_file->async_queue = NULL; + ev_file->is_closed = 0; + +#ifdef __linux__ + filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops, + ev_file, O_RDONLY); +#else + filp = ERR_PTR(-EOPNOTSUPP); +#endif + if (IS_ERR(filp)) + goto err_put_refs; + mutex_lock(&uverbs_file->device->lists_mutex); + list_add_tail(&ev_file->list, + &uverbs_file->device->uverbs_events_file_list); + mutex_unlock(&uverbs_file->device->lists_mutex); + + if (is_async) { + WARN_ON(uverbs_file->async_file); + uverbs_file->async_file = ev_file; + kref_get(&uverbs_file->async_file->ref); + INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, + ib_dev, + ib_uverbs_event_handler); + ret = ib_register_event_handler(&uverbs_file->event_handler); + if (ret) + goto err_put_file; - if (IS_ERR(filp)) { - kfree(ev_file); - } else { - filp->private_data = ev_file; + /* At that point async file stuff was fully set */ + ev_file->is_async = 1; } return filp; + +err_put_file: + fput(filp); + kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file); + uverbs_file->async_file = NULL; + return ERR_PTR(ret); + +err_put_refs: + kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); + kref_put(&ev_file->ref, ib_uverbs_release_event_file); + return filp; } /* @@ -665,221 +721,34 @@ return ev_file; } -static const char *verbs_cmd_str(__u32 cmd) -{ - switch (cmd) { - case IB_USER_VERBS_CMD_GET_CONTEXT: - return "GET_CONTEXT"; - case IB_USER_VERBS_CMD_QUERY_DEVICE: - return "QUERY_DEVICE"; - case IB_USER_VERBS_CMD_QUERY_PORT: - return "QUERY_PORT"; - case IB_USER_VERBS_CMD_ALLOC_PD: - return "ALLOC_PD"; - case IB_USER_VERBS_CMD_DEALLOC_PD: - return "DEALLOC_PD"; - case IB_USER_VERBS_CMD_REG_MR: - return "REG_MR"; - case IB_USER_VERBS_CMD_DEREG_MR: - return "DEREG_MR"; - case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL: - return "CREATE_COMP_CHANNEL"; - case IB_USER_VERBS_CMD_CREATE_CQ: - return "CREATE_CQ"; - case 
IB_USER_VERBS_CMD_RESIZE_CQ: - return "RESIZE_CQ"; - case IB_USER_VERBS_CMD_POLL_CQ: - return "POLL_CQ"; - case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ: - return "REQ_NOTIFY_CQ"; - case IB_USER_VERBS_CMD_DESTROY_CQ: - return "DESTROY_CQ"; - case IB_USER_VERBS_CMD_CREATE_QP: - return "CREATE_QP"; - case IB_USER_VERBS_CMD_QUERY_QP: - return "QUERY_QP"; - case IB_USER_VERBS_CMD_MODIFY_QP: - return "MODIFY_QP"; - case IB_USER_VERBS_CMD_DESTROY_QP: - return "DESTROY_QP"; - case IB_USER_VERBS_CMD_POST_SEND: - return "POST_SEND"; - case IB_USER_VERBS_CMD_POST_RECV: - return "POST_RECV"; - case IB_USER_VERBS_CMD_POST_SRQ_RECV: - return "POST_SRQ_RECV"; - case IB_USER_VERBS_CMD_CREATE_AH: - return "CREATE_AH"; - case IB_USER_VERBS_CMD_DESTROY_AH: - return "DESTROY_AH"; - case IB_USER_VERBS_CMD_ATTACH_MCAST: - return "ATTACH_MCAST"; - case IB_USER_VERBS_CMD_DETACH_MCAST: - return "DETACH_MCAST"; - case IB_USER_VERBS_CMD_CREATE_SRQ: - return "CREATE_SRQ"; - case IB_USER_VERBS_CMD_MODIFY_SRQ: - return "MODIFY_SRQ"; - case IB_USER_VERBS_CMD_QUERY_SRQ: - return "QUERY_SRQ"; - case IB_USER_VERBS_CMD_DESTROY_SRQ: - return "DESTROY_SRQ"; - case IB_USER_VERBS_CMD_OPEN_XRCD: - return "OPEN_XRCD"; - case IB_USER_VERBS_CMD_CLOSE_XRCD: - return "CLOSE_XRCD"; - case IB_USER_VERBS_CMD_CREATE_XSRQ: - return "CREATE_XSRQ"; - case IB_USER_VERBS_CMD_OPEN_QP: - return "OPEN_QP"; - } - - return "Unknown command"; -} - -enum { - COMMAND_INFO_MASK = 0x1000, -}; - -static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file, - const char __user *buf, - struct ib_device *dev, - struct ib_uverbs_cmd_hdr *hdr, - size_t count, - int legacy_ex_cmd) +static int verify_command_mask(struct ib_device *ib_dev, __u32 command) { - struct ib_udata ucore; - struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; - __u32 command = hdr->command - IB_USER_VERBS_EXP_CMD_FIRST; - - if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; - - if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) || - !uverbs_exp_cmd_table[command]) - return -EINVAL; - - if (!file->ucontext) - return -EINVAL; - - if (!(dev->uverbs_exp_cmd_mask & (1ull << command))) - return -ENOSYS; - - if (legacy_ex_cmd) { - struct ib_uverbs_ex_cmd_hdr_legacy hxl; - struct ib_uverbs_ex_cmd_resp1_legacy resp1; - __u64 response; - ssize_t ret; - - if (count < sizeof(hxl)) - return -EINVAL; - - if (copy_from_user(&hxl, buf, sizeof(hxl))) - return -EFAULT; - - if (((hxl.in_words + hxl.provider_in_words) * 4) != count) - return -EINVAL; - - count -= sizeof(hxl); - buf += sizeof(hxl); - if (hxl.out_words || hxl.provider_out_words) { - if (count < sizeof(resp1)) - return -EINVAL; - if (copy_from_user(&resp1, buf, sizeof(resp1))) - return -EFAULT; - response = resp1.response; - if (!response) - return -EINVAL; - - /* - * Change user buffer to comply with new extension format. - */ - if (sizeof(resp1.comp_mask) != sizeof(resp1.response)) - return -EFAULT; - buf += sizeof(resp1.comp_mask); - if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask, - sizeof(resp1.response))) - return -EFAULT; - - } else { - response = 0; - } - - INIT_UDATA_EX(&ucore, - (hxl.in_words) ? buf : 0, - response, - hxl.in_words * 4, - hxl.out_words * 4); - - INIT_UDATA_EX(&uhw, - (hxl.provider_in_words) ? buf + ucore.inlen : 0, - (hxl.provider_out_words) ? 
response + ucore.outlen : 0, - hxl.provider_in_words * 4, - hxl.provider_out_words * 4); - - ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw); - /* - * UnChange user buffer - */ - if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response))) - return -EFAULT; - - return ret; - } else { - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; - - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; - - buf += sizeof(hdr) + sizeof(ex_hdr); - - if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; + u64 mask; - if (ex_hdr.response) { - if (!hdr->out_words && !ex_hdr.provider_out_words) - return -EINVAL; - } else { - if (hdr->out_words || ex_hdr.provider_out_words) - return -EINVAL; - } - - INIT_UDATA_EX(&ucore, - (hdr->in_words) ? buf : 0, - (unsigned long)ex_hdr.response, - hdr->in_words * 8, - hdr->out_words * 8); + if (command <= IB_USER_VERBS_CMD_OPEN_QP) + mask = ib_dev->uverbs_cmd_mask; + else + mask = ib_dev->uverbs_ex_cmd_mask; - INIT_UDATA_EX(&uhw, - (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, - (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, - ex_hdr.provider_in_words * 8, - ex_hdr.provider_out_words * 8); + if (mask & ((u64)1 << command)) + return 0; - return uverbs_exp_cmd_table[command](file, &ucore, &uhw); - } + return -1; } static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; - struct ib_device *dev = file->device->ib_dev; + struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; - struct timespec ts1; - struct timespec ts2; - ktime_t t1, t2, delta; - s64 ds; - ssize_t ret; - u64 dividend; - u32 divisor; - __u32 flags; __u32 command; - int legacy_ex_cmd = 0; - size_t written_count = count; + __u32 flags; + int srcu_key; + ssize_t ret; + + if (WARN_ON_ONCE(!ib_safe_file_access(filp))) + return -EACCES; if (count < sizeof hdr) return -EINVAL; @@ -887,171 +756,157 @@ if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - /* - * For BWD compatibility change old style extension verbs commands - * to their equivalent experimental command. 
- */ - if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) && - (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) { - hdr.command += IB_USER_VERBS_EXP_CMD_FIRST - - IB_USER_VERBS_LEGACY_CMD_FIRST; - legacy_ex_cmd = 1; + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; + } + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + if (verify_command_mask(ib_dev, command)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + ret = -EINVAL; + goto out; } flags = (hdr.command & IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - ktime_get_ts(&ts1); - if (!flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST)) { - ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd); - } else if (!flags) { + if (!flags) { if (command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[command]) - return -EINVAL; - - if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + !uverbs_cmd_table[command]) { + ret = -EINVAL; + goto out; + } - if (!(dev->uverbs_cmd_mask & (1ull << command))) - return -ENOSYS; + if (hdr.in_words * 4 != count) { + ret = -EINVAL; + goto out; + } - if (hdr.in_words * 4 != count) - return -EINVAL; + ret = uverbs_cmd_table[command](file, ib_dev, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); - ret = uverbs_cmd_table[command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; - - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; + size_t written_count = count; if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || - !uverbs_ex_cmd_table[command]) - return -EINVAL; - - if (!file->ucontext) - return -EINVAL; + !uverbs_ex_cmd_table[command]) { + ret = -ENOSYS; + goto out; + } - if (!(dev->uverbs_ex_cmd_mask & (1ull << command))) - return -ENOSYS; + if (!file->ucontext) { + ret = -EINVAL; + goto out; + } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; + if (count < (sizeof(hdr) + sizeof(ex_hdr))) { + ret = -EINVAL; + goto out; + } - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { + ret = -EFAULT; + goto out; + } count -= sizeof(hdr) + sizeof(ex_hdr); buf += sizeof(hdr) + sizeof(ex_hdr); - if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) { + ret = -EINVAL; + goto out; + } + + if (ex_hdr.cmd_hdr_reserved) { + ret = -EINVAL; + goto out; + } if (ex_hdr.response) { - if (!hdr.out_words && !ex_hdr.provider_out_words) - return -EINVAL; + if (!hdr.out_words && !ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) { + ret = -EFAULT; + goto out; + } } else { - if (hdr.out_words || ex_hdr.provider_out_words) - return -EINVAL; + if (hdr.out_words 
|| ex_hdr.provider_out_words) { + ret = -EINVAL; + goto out; + } } - INIT_UDATA_EX(&ucore, - (hdr.in_words) ? buf : 0, - (unsigned long)ex_hdr.response, - hdr.in_words * 8, - hdr.out_words * 8); - - INIT_UDATA_EX(&uhw, - (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, - (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, - ex_hdr.provider_in_words * 8, - ex_hdr.provider_out_words * 8); - - ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw); - - if (ret) - return ret; - - return written_count; - + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); + + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + ret = uverbs_ex_cmd_table[command](file, + ib_dev, + &ucore, + &uhw); + if (!ret) + ret = written_count; } else { - return -EFAULT; + ret = -ENOSYS; } - if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) { - ktime_get_ts(&ts2); - t1 = timespec_to_ktime(ts1); - t2 = timespec_to_ktime(ts2); - delta = ktime_sub(t2, t1); - ds = ktime_to_ns(delta); - spin_lock(&dev->cmd_perf_lock); - dividend = dev->cmd_avg * dev->cmd_n + ds; - ++dev->cmd_n; - divisor = dev->cmd_n; - do_div(dividend, divisor); - dev->cmd_avg = dividend; - spin_unlock(&dev->cmd_perf_lock); - if (dev->cmd_perf & COMMAND_INFO_MASK) { - pr_info("%s: %s execution time = %lld nsec\n", - file->device->ib_dev->name, - verbs_cmd_str(hdr.command), - (long long)ds); - } - } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; + struct ib_device *ib_dev; + int ret = 0; + int srcu_key; - if (!file->ucontext) - return -ENODEV; - else - return file->device->ib_dev->mmap(file->ucontext, vma); -} -/* XXX Not supported in FreeBSD */ -#if 0 -static unsigned long ib_uverbs_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct ib_uverbs_file *file = filp->private_data; - - if (!file->ucontext) - return -ENODEV; - else { - if (!file->device->ib_dev->get_unmapped_area) - return current->mm->get_unmapped_area(filp, addr, len, - pgoff, flags); - - return file->device->ib_dev->get_unmapped_area(filp, addr, len, - pgoff, flags); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; } -} -#endif - -static long ib_uverbs_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct ib_uverbs_file *file = filp->private_data; - - if (!file->device->ib_dev->ioctl) - return -ENOTSUPP; if (!file->ucontext) - return -ENODEV; + ret = -ENODEV; else - /* provider should provide it's own locking mechanism */ - return file->device->ib_dev->ioctl(file->ucontext, cmd, arg); + ret = ib_dev->mmap(file->ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } /* @@ -1068,23 +923,43 @@ { struct ib_uverbs_device *dev; struct ib_uverbs_file *file; + struct ib_device *ib_dev; int ret; + int module_dependent; + int srcu_key; dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev); - if (dev) - kref_get(&dev->ref); - else + if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; - if (!try_module_get(dev->ib_dev->owner)) { - ret = 
-ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + mutex_lock(&dev->lists_mutex); + ib_dev = srcu_dereference(dev->ib_dev, + &dev->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; goto err; } - file = kmalloc(sizeof *file, GFP_KERNEL); + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. + */ + module_dependent = !(ib_dev->disassociate_ucontext); + + if (module_dependent) { + if (!try_module_get(ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; - goto err_module; + if (module_dependent) + goto err_module; + + goto err; } file->device = dev; @@ -1092,54 +967,71 @@ file->async_file = NULL; kref_init(&file->ref); mutex_init(&file->mutex); + mutex_init(&file->cleanup_mutex); filp->private_data = file; + kobject_get(&dev->kobj); + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return nonseekable_open(inode, filp); err_module: - module_put(dev->ib_dev->owner); + module_put(ib_dev->owner); err: - kref_put(&dev->ref, ib_uverbs_release_dev); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + return ret; } static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_device *dev = file->device; - ib_uverbs_cleanup_ucontext(file, file->ucontext); + mutex_lock(&file->cleanup_mutex); + if (file->ucontext) { + ib_uverbs_cleanup_ucontext(file, file->ucontext); + file->ucontext = NULL; + } + mutex_unlock(&file->cleanup_mutex); + + mutex_lock(&file->device->lists_mutex); + if (!file->is_closed) { + list_del(&file->list); + file->is_closed = 1; + } + mutex_unlock(&file->device->lists_mutex); if (file->async_file) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); kref_put(&file->ref, ib_uverbs_release_file); + kobject_put(&dev->kobj); return 0; } static const struct file_operations uverbs_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, - .open = ib_uverbs_open, + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, - .unlocked_ioctl = ib_uverbs_ioctl, }; static const struct file_operations uverbs_mmap_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, + .owner = THIS_MODULE, + .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, - .open = ib_uverbs_open, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, -/* XXX Not supported in FreeBSD */ -#if 0 - .get_unmapped_area = ib_uverbs_get_unmapped_area, -#endif - .unlocked_ioctl = ib_uverbs_ioctl, }; static struct ib_client uverbs_client = { @@ -1151,45 +1043,46 @@ static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, char *buf) { + int ret = -ENODEV; + int srcu_key; struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_device *ib_dev; if (!dev) return -ENODEV; - return sprintf(buf, "%s\n", dev->ib_dev->name); -} -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); - -static ssize_t show_dev_ref_cnt(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); 
+ if (ib_dev) + ret = sprintf(buf, "%s\n", ib_dev->name); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - if (!dev) - return -ENODEV; - - return sprintf(buf, "%d\n", atomic_read(&dev->ref.refcount)); + return ret; } -static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL); +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = dev_get_drvdata(device); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; if (!dev) return -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); + return ret; } static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); -static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); -} - -static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_VERBS_ABI_VERSION)); static dev_t overflow_maj; static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); @@ -1207,7 +1100,7 @@ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + pr_err("user_verbs: couldn't register dynamic device number\n"); return ret; } } @@ -1218,52 +1111,13 @@ return ret; } -#include - -static ssize_t -show_dev_device(struct device *device, struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev || !dev->ib_dev->dma_device) - return -ENODEV; - - return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->device); -} -static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL); - -static ssize_t -show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev || !dev->ib_dev->dma_device) - return -ENODEV; - - return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); -} - -static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL); - -struct attribute *device_attrs[] = -{ - &dev_attr_device.attr, - &dev_attr_vendor.attr, - NULL -}; - -static struct attribute_group device_group = { - .name = "device", - .attrs = device_attrs -}; static void ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; + int ret; if (!device->alloc_ucontext) return; @@ -1272,10 +1126,20 @@ if (!uverbs_dev) return; - kref_init(&uverbs_dev->ref); + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) { + kfree(uverbs_dev); + return; + } + + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); + mutex_init(&uverbs_dev->lists_mutex); + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -1283,7 +1147,7 @@ spin_unlock(&map_lock); devnum = find_overflow_devnum(); if (devnum 
< 0) - goto err; + goto err; spin_lock(&map_lock); uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; @@ -1296,12 +1160,13 @@ } spin_unlock(&map_lock); - uverbs_dev->ib_dev = device; + rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; @@ -1314,12 +1179,8 @@ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt)) - goto err_class; if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; - if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group)) - goto err_class; ib_set_client_data(device, &uverbs_client, uverbs_dev); @@ -1336,32 +1197,122 @@ clear_bit(devnum, overflow_map); err: - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); return; } -static void ib_uverbs_remove_one(struct ib_device *device) +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, + struct ib_device *ib_dev) +{ + struct ib_uverbs_file *file; + struct ib_uverbs_event_file *event_file; + struct ib_event event; + + /* Pending running commands to terminate */ + synchronize_srcu(&uverbs_dev->disassociate_srcu); + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + event.device = ib_dev; + + mutex_lock(&uverbs_dev->lists_mutex); + while (!list_empty(&uverbs_dev->uverbs_file_list)) { + struct ib_ucontext *ucontext; + file = list_first_entry(&uverbs_dev->uverbs_file_list, + struct ib_uverbs_file, list); + file->is_closed = 1; + list_del(&file->list); + kref_get(&file->ref); + mutex_unlock(&uverbs_dev->lists_mutex); + + ib_uverbs_event_handler(&file->event_handler, &event); + + mutex_lock(&file->cleanup_mutex); + ucontext = file->ucontext; + file->ucontext = NULL; + mutex_unlock(&file->cleanup_mutex); + + /* At this point ib_uverbs_close cannot be running + * ib_uverbs_cleanup_ucontext + */ + if (ucontext) { + /* We must release the mutex before going ahead and + * calling disassociate_ucontext. disassociate_ucontext + * might end up indirectly calling uverbs_close, + * for example due to freeing the resources + * (e.g mmput). 
+ */ + ib_dev->disassociate_ucontext(ucontext); + ib_uverbs_cleanup_ucontext(file, ucontext); + } + + mutex_lock(&uverbs_dev->lists_mutex); + kref_put(&file->ref, ib_uverbs_release_file); + } + + while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { + event_file = list_first_entry(&uverbs_dev-> + uverbs_events_file_list, + struct ib_uverbs_event_file, + list); + spin_lock_irq(&event_file->lock); + event_file->is_closed = 1; + spin_unlock_irq(&event_file->lock); + + list_del(&event_file->list); + if (event_file->is_async) { + ib_unregister_event_handler(&event_file->uverbs_file-> + event_handler); + event_file->uverbs_file->event_handler.device = NULL; + } + + wake_up_interruptible(&event_file->poll_wait); + kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) { - struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_device *uverbs_dev = client_data; + int wait_clients = 1; if (!uverbs_dev) return; - sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group); dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev.dev); cdev_del(&uverbs_dev->cdev); if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); + clear_bit(uverbs_dev->devnum, dev_map); else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); - wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + if (device->disassociate_ucontext) { + /* We disassociate HW resources and immediately return. + * Userspace will see a EIO errno for all future access. + * Upon returning, ib_device may be freed internally and is not + * valid any more. + * uverbs_device is still available until all clients close + * their files, then the uverbs device ref count will be zero + * and its resources will be freed. + * Note: At this point no more files can be opened since the + * cdev was deleted, however active clients can still issue + * commands and close their open files. 
+ */ + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + ib_uverbs_free_hw_resources(uverbs_dev, device); + wait_clients = 0; + } + + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + if (wait_clients) + wait_for_completion(&uverbs_dev->comp); + kobject_put(&uverbs_dev->kobj); } static char *uverbs_devnode(struct device *dev, umode_t *mode) @@ -1378,28 +1329,28 @@ ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register device number\n"); + pr_err("user_verbs: couldn't register device number\n"); goto out; } uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); - printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n"); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } uverbs_class->devnode = uverbs_devnode; - ret = class_create_file(uverbs_class, &class_attr_abi_version); + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); + pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&uverbs_client); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register client\n"); + pr_err("user_verbs: couldn't register client\n"); goto out_class; } @@ -1431,5 +1382,5 @@ idr_destroy(&ib_uverbs_srq_idr); } -module_init(ib_uverbs_init); +module_init_order(ib_uverbs_init, SI_ORDER_THIRD); module_exit(ib_uverbs_cleanup); Index: sys/ofed/drivers/infiniband/core/ib_uverbs_marshall.c =================================================================== --- sys/ofed/drivers/infiniband/core/ib_uverbs_marshall.c +++ sys/ofed/drivers/infiniband/core/ib_uverbs_marshall.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, @@ -140,5 +139,10 @@ dst->packet_life_time = src->packet_life_time; dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; + + memset(dst->dmac, 0, sizeof(dst->dmac)); + dst->net = NULL; + dst->ifindex = 0; + dst->gid_type = IB_GID_TYPE_IB; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); Index: sys/ofed/drivers/infiniband/core/ib_verbs.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/ib_verbs.c @@ -0,0 +1,2066 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include "core_priv.h" + +static const char * const ib_events[] = { + [IB_EVENT_CQ_ERR] = "CQ error", + [IB_EVENT_QP_FATAL] = "QP fatal error", + [IB_EVENT_QP_REQ_ERR] = "QP request error", + [IB_EVENT_QP_ACCESS_ERR] = "QP access error", + [IB_EVENT_COMM_EST] = "communication established", + [IB_EVENT_SQ_DRAINED] = "send queue drained", + [IB_EVENT_PATH_MIG] = "path migration successful", + [IB_EVENT_PATH_MIG_ERR] = "path migration error", + [IB_EVENT_DEVICE_FATAL] = "device fatal error", + [IB_EVENT_PORT_ACTIVE] = "port active", + [IB_EVENT_PORT_ERR] = "port error", + [IB_EVENT_LID_CHANGE] = "LID change", + [IB_EVENT_PKEY_CHANGE] = "P_key change", + [IB_EVENT_SM_CHANGE] = "SM change", + [IB_EVENT_SRQ_ERR] = "SRQ error", + [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IB_EVENT_CLIENT_REREGISTER] = "client reregister", + [IB_EVENT_GID_CHANGE] = "GID changed", +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? 
+ ib_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(ib_event_msg); + +static const char * const wc_statuses[] = { + [IB_WC_SUCCESS] = "success", + [IB_WC_LOC_LEN_ERR] = "local length error", + [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IB_WC_LOC_PROT_ERR] = "local protection error", + [IB_WC_WR_FLUSH_ERR] = "WR flushed", + [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_BAD_RESP_ERR] = "bad response error", + [IB_WC_LOC_ACCESS_ERR] = "local access error", + [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_ACCESS_ERR] = "remote access error", + [IB_WC_REM_OP_ERR] = "remote operation error", + [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IB_WC_REM_ABORT_ERR] = "operation aborted", + [IB_WC_INV_EECN_ERR] = "invalid EE context number", + [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IB_WC_FATAL_ERR] = "fatal error", + [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IB_WC_GENERAL_ERR] = "general error", +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) +{ + size_t index = status; + + return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? + wc_statuses[index] : "unrecognized status"; +} +EXPORT_SYMBOL(ib_wc_status_msg); + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + case RDMA_NODE_IB_SWITCH: + case RDMA_NODE_IB_ROUTER: + return RDMA_TRANSPORT_IB; + case RDMA_NODE_RNIC: + return RDMA_TRANSPORT_IWARP; + case 
RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; + default: + BUG(); + return 0; + } +} +EXPORT_SYMBOL(rdma_node_get_transport); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +{ + if (device->get_link_layer) + return device->get_link_layer(device, port_num); + + switch (rdma_node_get_transport(device->node_type)) { + case RDMA_TRANSPORT_IB: + return IB_LINK_LAYER_INFINIBAND; + case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + +/* Protection domains */ + +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. + */ +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller) +{ + struct ib_pd *pd; + int mr_access_flags = 0; + + pd = device->alloc_pd(device, NULL, NULL); + if (IS_ERR(pd)) + return pd; + + pd->device = device; + pd->uobject = NULL; + pd->__internal_mr = NULL; + atomic_set(&pd->usecnt, 0); + pd->flags = flags; + + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + pd->local_dma_lkey = device->local_dma_lkey; + else + mr_access_flags |= IB_ACCESS_LOCAL_WRITE; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + pr_warn("%s: enabling unsafe global rkey\n", caller); + mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; + } + + if (mr_access_flags) { + struct ib_mr *mr; + + mr = pd->device->get_dma_mr(pd, mr_access_flags); + if (IS_ERR(mr)) { + ib_dealloc_pd(pd); + return ERR_CAST(mr); + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + mr->need_inval = false; + + pd->__internal_mr = mr; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + pd->local_dma_lkey = pd->__internal_mr->lkey; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) + pd->unsafe_global_rkey = pd->__internal_mr->rkey; + } + + return pd; +} +EXPORT_SYMBOL(__ib_alloc_pd); + +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * + * It is an error to call this function while any resources in the pd still + * exist. The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +void ib_dealloc_pd(struct ib_pd *pd) +{ + int ret; + + if (pd->__internal_mr) { + ret = pd->device->dereg_mr(pd->__internal_mr); + WARN_ON(ret); + pd->__internal_mr = NULL; + } + + /* uverbs manipulates usecnt with proper locking, while the kabi + requires the caller to guarantee we can't race here. */ + WARN_ON(atomic_read(&pd->usecnt)); + + /* Making delalloc_pd a void return is a WIP, no driver should return + an error here. 
*/ + ret = pd->device->dealloc_pd(pd); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); +} +EXPORT_SYMBOL(ib_dealloc_pd); + +/* Address handles */ + +struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct ib_ah *ah; + + ah = pd->device->create_ah(pd, ah_attr); + + if (!IS_ERR(ah)) { + ah->device = pd->device; + ah->pd = pd; + ah->uobject = NULL; + atomic_inc(&pd->usecnt); + } + + return ah; +} +EXPORT_SYMBOL(ib_create_ah); + +static int ib_get_header_version(const union rdma_network_hdr *hdr) +{ + const struct ip *ip4h = (const struct ip *)&hdr->roce4grh; + struct ip ip4h_checked; + const struct ip6_hdr *ip6h = (const struct ip6_hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if ((ip6h->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) + return (ip4h->ip_v == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ip_hl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. + */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.ip_sum = 0; + ip4h_checked.ip_sum = in_cksum_hdr(&ip4h_checked); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->ip_sum == ip4h_checked.ip_sum) + return 4; + return 6; +} + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_header_version((const union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + +struct find_gid_index_context { + u16 vlan_id; + enum ib_gid_type gid_type; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + struct find_gid_index_context *ctx = + (struct find_gid_index_context *)context; + + if (ctx->gid_type != gid_attr->gid_type) + return false; + + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || + (is_vlan_dev(gid_attr->ndev) && + vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) + return false; + + return true; +} + +static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, + u16 *gid_index) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; + + return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context, gid_index); +} + +static int get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.ip_src, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.ip_dst, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + 
*dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} + +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr) +{ + u32 flow_class; + u16 gid_index; + int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; + + memset(ah_attr, 0, sizeof *ah_attr); + if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = get_gids_from_rdma_hdr((const union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + if (rdma_protocol_roce(device, port_num)) { + int if_index = 0; + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? + wc->vlan_id : 0xffff; + struct net_device *idev; + struct net_device *resolved_dev; + + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, + &if_index, &hoplimit); + if (ret) { + dev_put(idev); + return ret; + } + + resolved_dev = dev_get_by_index(&init_net, if_index); + if (resolved_dev->if_flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + rcu_read_lock(); + if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, + resolved_dev)) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) + return ret; + + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &dgid, gid_type, &gid_index); + if (ret) + return ret; + } + + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IB_WC_GRH) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = sgid; + + if (!rdma_cap_eth_ah(device, port_num)) { + if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } else { + gid_index = 0; + } + } + + ah_attr->grh.sgid_index = (u8) gid_index; + flow_class = be32_to_cpu(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = hoplimit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_wc); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num) +{ + struct ib_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + return ib_create_ah(pd, &ah_attr); +} +EXPORT_SYMBOL(ib_create_ah_from_wc); + +int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->modify_ah ? + ah->device->modify_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_ah); + +int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->query_ah ? 
+ ah->device->query_ah(ah, ah_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_ah); + +int ib_destroy_ah(struct ib_ah *ah) +{ + struct ib_pd *pd; + int ret; + + pd = ah->pd; + ret = ah->device->destroy_ah(ah); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_ah); + +/* Shared receive queues */ + +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_srq) + return ERR_PTR(-ENOSYS); + + srq = pd->device->create_srq(pd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + atomic_inc(&srq->ext.xrc.cq->usecnt); + } + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + } + + return srq; +} +EXPORT_SYMBOL(ib_create_srq); + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->modify_srq ? + srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_srq); + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->query_srq ? + srq->device->query_srq(srq, srq_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_srq); + +int ib_destroy_srq(struct ib_srq *srq) +{ + struct ib_pd *pd; + enum ib_srq_type srq_type; + struct ib_xrcd *uninitialized_var(xrcd); + struct ib_cq *uninitialized_var(cq); + int ret; + + if (atomic_read(&srq->usecnt)) + return -EBUSY; + + pd = srq->pd; + srq_type = srq->srq_type; + if (srq_type == IB_SRQT_XRC) { + xrcd = srq->ext.xrc.xrcd; + cq = srq->ext.xrc.cq; + } + + ret = srq->device->destroy_srq(srq); + if (!ret) { + atomic_dec(&pd->usecnt); + if (srq_type == IB_SRQT_XRC) { + atomic_dec(&xrcd->usecnt); + atomic_dec(&cq->usecnt); + } + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_srq); + +/* Queue pairs */ + +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + unsigned long flags; + + spin_lock_irqsave(&qp->device->event_handler_lock, flags); + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); +} + +static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) +{ + struct ib_qp *qp; + unsigned long flags; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_add(&qp->open_list, &real_qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + return qp; +} + 
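+/*
+ * Illustrative sketch (not introduced by this change) of how an XRC
+ * consumer would open a second handle to an existing target QP by
+ * number; the names xrcd, my_qp_event, my_ctx and tgt_qp_num are
+ * assumptions for the example only:
+ *
+ *	struct ib_qp_open_attr oattr = {
+ *		.event_handler	= my_qp_event,
+ *		.qp_context	= my_ctx,
+ *		.qp_num		= tgt_qp_num,
+ *		.qp_type	= IB_QPT_XRC_TGT,
+ *	};
+ *	struct ib_qp *qp = ib_open_qp(xrcd, &oattr);
+ *
+ *	if (IS_ERR(qp))
+ *		return PTR_ERR(qp);
+ */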
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + qp = ERR_PTR(-EINVAL); + mutex_lock(&xrcd->tgt_qp_mutex); + list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { + if (real_qp->qp_num == qp_open_attr->qp_num) { + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + break; + } + } + mutex_unlock(&xrcd->tgt_qp_mutex); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + + if (qp_init_attr->rwq_ind_tbl && + (qp_init_attr->recv_cq || + qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || + qp_init_attr->cap.max_recv_sge)) + return ERR_PTR(-EINVAL); + + qp = device->create_qp(pd, qp_init_attr, NULL); + if (IS_ERR(qp)) + return qp; + + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; + + atomic_set(&qp->usecnt, 0); + spin_lock_init(&qp->mr_lock); + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + if (qp_init_attr->recv_cq) + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } + + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + if (qp_init_attr->send_cq) + atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); + + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. 
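+ * For example (illustrative numbers only): with cap.max_send_sge = 16 on
+ * a device reporting attrs.max_sge_rd = 8, the assignments below leave
+ * max_write_sge at 16 but clamp max_read_sge to 8.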
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + + return qp; +} +EXPORT_SYMBOL(ib_create_qp); + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_PORT, + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + }, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + 
IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + 
[IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (cur_state < 0 || cur_state > IB_QPS_ERR || + next_state < 0 || next_state > IB_QPS_ERR) + return 0; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return 0; + + if (!qp_state_table[cur_state][next_state].valid) + return 0; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) + return 0; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return 0; + + return 1; +} +EXPORT_SYMBOL(ib_modify_qp_is_ok); + +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + + if (*qp_attr_mask & IB_QP_AV) { + if (qp_attr->ah_attr.port_num < rdma_start_port(qp->device) || + qp_attr->ah_attr.port_num > rdma_end_port(qp->device)) + return -EINVAL; + + if (!rdma_cap_eth_ah(qp->device, qp_attr->ah_attr.port_num)) + return 0; + + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, + qp_attr->ah_attr.dmac); + } else { + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + int ifindex; + int hop_limit; + + ret = ib_query_gid(qp->device, + qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, + &sgid, &sgid_attr); + + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + goto out; + } + + ifindex = sgid_attr.ndev->if_index; + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, &ifindex, &hop_limit); + + dev_put(sgid_attr.ndev); + + qp_attr->ah_attr.grh.hop_limit = hop_limit; + } + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_dmac); + + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + int ret; + + ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +} +EXPORT_SYMBOL(ib_modify_qp); + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + return qp->device->query_qp ? 
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_qp); + +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + +int ib_destroy_qp(struct ib_qp *qp) +{ + struct ib_pd *pd; + struct ib_cq *scq, *rcq; + struct ib_srq *srq; + struct ib_rwq_ind_table *ind_tbl; + int ret; + + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + ind_tbl = qp->rwq_ind_tbl; + + ret = qp->device->destroy_qp(qp); + if (!ret) { + if (pd) + atomic_dec(&pd->usecnt); + if (scq) + atomic_dec(&scq->usecnt); + if (rcq) + atomic_dec(&rcq->usecnt); + if (srq) + atomic_dec(&srq->usecnt); + if (ind_tbl) + atomic_dec(&ind_tbl->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_qp); + +/* Completion queues */ + +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr) +{ + struct ib_cq *cq; + + cq = device->create_cq(device, cq_attr, NULL, NULL); + + if (!IS_ERR(cq)) { + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + } + + return cq; +} +EXPORT_SYMBOL(ib_create_cq); + +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return cq->device->modify_cq ? + cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS; +} +EXPORT_SYMBOL(ib_modify_cq); + +int ib_destroy_cq(struct ib_cq *cq) +{ + if (atomic_read(&cq->usecnt)) + return -EBUSY; + + return cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_destroy_cq); + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + return cq->device->resize_cq ? + cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; +} +EXPORT_SYMBOL(ib_resize_cq); + +/* Memory regions */ + +int ib_dereg_mr(struct ib_mr *mr) +{ + struct ib_pd *pd = mr->pd; + int ret; + + ret = mr->device->dereg_mr(mr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dereg_mr); + +/** + * ib_alloc_mr() - Allocates a memory region + * @pd: protection domain associated with the region + * @mr_type: memory region type + * @max_num_sg: maximum sg entries available for registration. + * + * Notes: + * Memory registeration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. 
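+ * Example (sketch only; "pd" and the value 16 are assumptions of the
+ * example, not something this interface requires):
+ *
+ *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
+ *
+ *	if (IS_ERR(mr))
+ *		return PTR_ERR(mr);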
+ * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->alloc_mr(pd, mr_type, max_num_sg); + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr); + +/* "Fast" memory regions */ + +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *fmr; + + if (!pd->device->alloc_fmr) + return ERR_PTR(-ENOSYS); + + fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); + if (!IS_ERR(fmr)) { + fmr->device = pd->device; + fmr->pd = pd; + atomic_inc(&pd->usecnt); + } + + return fmr; +} +EXPORT_SYMBOL(ib_alloc_fmr); + +int ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + + if (list_empty(fmr_list)) + return 0; + + fmr = list_entry(fmr_list->next, struct ib_fmr, list); + return fmr->device->unmap_fmr(fmr_list); +} +EXPORT_SYMBOL(ib_unmap_fmr); + +int ib_dealloc_fmr(struct ib_fmr *fmr) +{ + struct ib_pd *pd; + int ret; + + pd = fmr->pd; + ret = fmr->device->dealloc_fmr(fmr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dealloc_fmr); + +/* Multicast groups */ + +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->attach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_attach_mcast); + +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->detach_mcast) + return -ENOSYS; + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + + ret = qp->device->detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_detach_mcast); + +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-ENOSYS); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + xrcd->inode = NULL; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + } + + return xrcd; +} +EXPORT_SYMBOL(ib_alloc_xrcd); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct ib_qp *qp; + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +/** + * ib_create_wq - Creates a WQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the WQ. + * @wq_init_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ib_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. 
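+ * Example (sketch only; the pd, cq, context pointer and queue sizes are
+ * assumptions for illustration):
+ *
+ *	struct ib_wq_init_attr attr = {
+ *		.wq_type	= IB_WQT_RQ,
+ *		.max_wr		= 256,
+ *		.max_sge	= 1,
+ *		.cq		= cq,
+ *		.wq_context	= my_ctx,
+ *	};
+ *	struct ib_wq *wq = ib_create_wq(pd, &attr);
+ *
+ *	if (IS_ERR(wq))
+ *		return PTR_ERR(wq);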
+ */ +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->create_wq) + return ERR_PTR(-ENOSYS); + + wq = pd->device->create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +/** + * ib_destroy_wq - Destroys the specified WQ. + * @wq: The WQ to destroy. + */ +int ib_destroy_wq(struct ib_wq *wq) +{ + int err; + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + err = wq->device->destroy_wq(wq); + if (!err) { + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + } + return err; +} +EXPORT_SYMBOL(ib_destroy_wq); + +/** + * ib_modify_wq - Modifies the specified WQ. + * @wq: The WQ to modify. + * @wq_attr: On input, specifies the WQ attributes to modify. + * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ + * are being modified. + * On output, the current values of selected WQ attributes are returned. + */ +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask) +{ + int err; + + if (!wq->device->modify_wq) + return -ENOSYS; + + err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); + return err; +} +EXPORT_SYMBOL(ib_modify_wq); + +/* + * ib_create_rwq_ind_table - Creates a RQ Indirection Table. + * @device: The device on which to create the rwq indirection table. + * @ib_rwq_ind_table_init_attr: A list of initial attributes required to + * create the Indirection Table. + * + * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less + * than the created ib_rwq_ind_table object and the caller is responsible + * for its memory allocation/free. + */ +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr) +{ + struct ib_rwq_ind_table *rwq_ind_table; + int i; + u32 table_size; + + if (!device->create_rwq_ind_table) + return ERR_PTR(-ENOSYS); + + table_size = (1 << init_attr->log_ind_tbl_size); + rwq_ind_table = device->create_rwq_ind_table(device, + init_attr, NULL); + if (IS_ERR(rwq_ind_table)) + return rwq_ind_table; + + rwq_ind_table->ind_tbl = init_attr->ind_tbl; + rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; + rwq_ind_table->device = device; + rwq_ind_table->uobject = NULL; + atomic_set(&rwq_ind_table->usecnt, 0); + + for (i = 0; i < table_size; i++) + atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); + + return rwq_ind_table; +} +EXPORT_SYMBOL(ib_create_rwq_ind_table); + +/* + * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @wq_ind_table: The Indirection Table to destroy. 
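+ * Example teardown order (illustrative only; assumes the table was built
+ * from an array wqs[] of (1 << log_size) WQs as described for
+ * ib_create_rwq_ind_table() above, and that no QP still references it):
+ *
+ *	ib_destroy_rwq_ind_table(ind_tbl);
+ *	for (i = 0; i < (1 << log_size); i++)
+ *		ib_destroy_wq(wqs[i]);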
+*/ +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) +{ + int err, i; + u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); + struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; + + if (atomic_read(&rwq_ind_table->usecnt)) + return -EBUSY; + + err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); + if (!err) { + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_rwq_ind_table); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-ENOSYS); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) + atomic_inc(&qp->usecnt); + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp = flow_id->qp; + + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? + mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; +} +EXPORT_SYMBOL(ib_check_mr_status); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state) +{ + if (!device->set_vf_link_state) + return -ENOSYS; + + return device->set_vf_link_state(device, vf, port, state); +} +EXPORT_SYMBOL(ib_set_vf_link_state); + +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + if (!device->get_vf_config) + return -ENOSYS; + + return device->get_vf_config(device, vf, port, info); +} +EXPORT_SYMBOL(ib_get_vf_config); + +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats) +{ + if (!device->get_vf_stats) + return -ENOSYS; + + return device->get_vf_stats(device, vf, port, stats); +} +EXPORT_SYMBOL(ib_get_vf_stats); + +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type) +{ + if (!device->set_vf_guid) + return -ENOSYS; + + return device->set_vf_guid(device, vf, port, guid, type); +} +EXPORT_SYMBOL(ib_set_vf_guid); + +/** + * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list + * and set it the memory region. + * @mr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * @page_size: page vector desired page size + * + * Constraints: + * - The first sg element is allowed to have an offset. + * - Each sg element must either be aligned to page_size or virtually + * contiguous to the previous element. In case an sg element has a + * non-contiguous offset, the mapping prefix will not include it. + * - The last sg element is allowed to have length less than page_size. + * - If sg_nents total byte length exceeds the mr max_num_sge * page_size + * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these + * constraints holds and the page_size argument is ignored. + * + * Returns the number of sg elements that were mapped to the memory region. + * + * After this completes successfully, the memory region + * is ready for registration. 
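+ * Example fast-registration sketch (illustrative only; qp, mr, sgl, nents
+ * and the access flags are assumptions, and error handling is trimmed):
+ *
+ *	struct ib_send_wr *bad_wr;
+ *	struct ib_reg_wr wr = {};
+ *	int n;
+ *
+ *	n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
+ *	if (n != nents)
+ *		return n < 0 ? n : -EINVAL;
+ *
+ *	wr.wr.opcode = IB_WR_REG_MR;
+ *	wr.wr.send_flags = IB_SEND_SIGNALED;
+ *	wr.mr = mr;
+ *	wr.key = mr->rkey;
+ *	wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
+ *	return ib_post_send(qp, &wr.wr, &bad_wr);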
+ */ +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->map_mr_sg)) + return -ENOSYS; + + mr->page_size = page_size; + + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg); + +/** + * ib_sg_to_pages() - Convert the largest prefix of a sg list + * to a page vector + * @mr: memory region + * @sgl: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. + * @set_page: driver page assignment function pointer + * + * Core service helper for drivers to convert the largest + * prefix of given sg list to a page vector. The sg list + * prefix converted is the prefix that meet the requirements + * of ib_map_mr_sg. + * + * Returns the number of sg elements that were assigned to + * a page vector. + */ +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) +{ + struct scatterlist *sg; + u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + unsigned int last_page_off = 0; + u64 page_mask = ~((u64)mr->page_size - 1); + int i, ret; + + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; + mr->length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; + u64 end_dma_addr = dma_addr + dma_len; + u64 page_addr = dma_addr & page_mask; + + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. + */ + goto next_page; + } + + do { + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; +next_page: + page_addr += mr->page_size; + } while (page_addr < end_dma_addr); + + mr->length += dma_len; + last_end_dma_addr = end_dma_addr; + last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = 0; + return i; +} +EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. 
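+ * The QP is first moved to the error state, so the marker WR posted here
+ * completes with a flush status; once its completion has been reaped,
+ * every WR posted to the SQ before it must have been reaped as well.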
+ */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_send_wr swr = {}, *bad_swr; + int ret; + + if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + ret = ib_post_send(qp, &swr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. + */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}, *bad_rwr; + int ret; + + if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + ret = ib_post_recv(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->drain_sq) + qp->device->drain_sq(qp); + else + __ib_drain_sq(qp); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->drain_rq) + qp->device->drain_rq(qp); + else + __ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. 
+ * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + if (!qp->srq) + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); Index: sys/ofed/drivers/infiniband/core/iwcm.h =================================================================== --- sys/ofed/drivers/infiniband/core/iwcm.h +++ sys/ofed/drivers/infiniband/core/iwcm.h @@ -56,7 +56,7 @@ struct list_head work_free_list; }; -#define IWCM_F_CALLBACK_DESTROY 1 +#define IWCM_F_DROP_EVENTS 1 #define IWCM_F_CONNECT_WAIT 2 #endif /* IWCM_H */ Index: sys/ofed/drivers/infiniband/core/iwpm_util.h =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif Index: sys/ofed/drivers/infiniband/core/mad_priv.h =================================================================== --- sys/ofed/drivers/infiniband/core/mad_priv.h +++ sys/ofed/drivers/infiniband/core/mad_priv.h @@ -41,9 +41,7 @@ #include #include #include - - -#define PFX "ib_mad: " +#include #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ @@ -59,13 +57,14 @@ /* Registration table sizes */ #define MAX_MGMT_CLASS 80 -#define MAX_MGMT_VERSION 8 +#define MAX_MGMT_VERSION 0x83 #define MAX_MGMT_OUI 8 #define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; @@ -78,12 +77,9 @@ struct ib_mad_private { struct ib_mad_private_header header; + size_t mad_size; struct ib_grh grh; - union { - struct ib_mad mad; - struct ib_rmpp_mad rmpp_mad; - struct ib_smp smp; - } mad; + u8 mad[0]; } __attribute__ ((packed)); struct ib_rmpp_segment { @@ -121,14 +117,6 @@ struct completion comp; }; -/* Structure for timeout-fifo entry */ -struct tf_entry { - unsigned long exp_time; /* entry expiration time */ - struct list_head fifo_list; /* to keep entries in fifo order */ - struct list_head to_list; /* to keep entries in timeout order */ - int canceled; /* indicates whether entry is canceled */ -}; - struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; @@ -136,7 +124,7 @@ struct ib_mad_send_buf send_buf; u64 header_mapping; u64 payload_mapping; - struct ib_send_wr send_wr; + struct ib_ud_wr send_wr; struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; unsigned long timeout; @@ -154,10 +142,6 @@ int seg_num; int newwin; int pad; - - /* SA congestion controlled MAD */ - int is_sa_cc_mad; - struct tf_entry tf_list; }; struct ib_mad_local_private { @@ -165,6 +149,7 @@ struct ib_mad_private *mad_priv; struct ib_mad_agent_private *recv_mad_agent; struct ib_mad_send_wr_private *mad_send_wr; + size_t return_wc_byte_len; }; struct ib_mad_mgmt_method_table { @@ -209,47 +194,25 @@ atomic_t snoop_count; }; -struct to_fifo { - struct list_head to_head; - struct list_head fifo_head; - spinlock_t lists_lock; - struct timer_list timer; - struct work_struct work; - u32 fifo_size; - u32 num_items; - int stop_enqueue; - struct workqueue_struct *workq; -}; - -/* SA congestion control data */ -struct sa_cc_data { - spinlock_t lock; - unsigned long outstanding; - struct to_fifo *tf; -}; - struct ib_mad_port_private { struct list_head port_list; struct ib_device *device; int port_num; struct ib_cq *cq; struct ib_pd *pd; - struct ib_mr *mr; spinlock_t reg_lock; struct 
ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; - struct sa_cc_data sa_cc; }; int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); struct ib_mad_send_wr_private * -ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *mad_recv_wc); +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *mad_recv_wc); void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_send_wc *mad_send_wc); Index: sys/ofed/drivers/infiniband/core/opa_smi.h =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/opa_smi.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __OPA_SMI_H_ +#define __OPA_SMI_H_ + +#include +#include + +#include "smi.h" + +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt); +int opa_smi_get_fwd_port(struct opa_smp *smp); +extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); +extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return (device->process_mad && + !opa_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? 
+ IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return (device->process_mad && + opa_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +#endif /* __OPA_SMI_H_ */ Index: sys/ofed/drivers/infiniband/core/peer_mem.c =================================================================== --- sys/ofed/drivers/infiniband/core/peer_mem.c +++ /dev/null @@ -1,461 +0,0 @@ -/* - * Copyright (c) 2013, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include - -static DEFINE_MUTEX(peer_memory_mutex); -static LIST_HEAD(peer_memory_list); - -static int num_registered_peers; - -/* This code uses the sysfs which is not supporeted by the FreeBSD. 
- * * Will be added in future to the sysctl */ - -#if 0 -static struct kobject *peers_kobj; -static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj); -static ssize_t version_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%s\n", ib_peer_client->peer_mem->version); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_alloc_mrs_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_alloc_mrs); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_reg_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_reg_pages); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_dereg_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_dereg_pages); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static ssize_t num_free_callbacks_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj); - - if (ib_peer_client) { - sprintf(buf, "%lu\n", ib_peer_client->stats.num_free_callbacks); - return strlen(buf); - } - /* not found - nothing is return */ - return 0; -} - -static struct kobj_attribute version_attr = __ATTR_RO(version); -static struct kobj_attribute num_alloc_mrs = __ATTR_RO(num_alloc_mrs); -static struct kobj_attribute num_reg_pages = __ATTR_RO(num_reg_pages); -static struct kobj_attribute num_dereg_pages = __ATTR_RO(num_dereg_pages); -static struct kobj_attribute num_free_callbacks = __ATTR_RO(num_free_callbacks); - -static struct attribute *peer_mem_attrs[] = { - &version_attr.attr, - &num_alloc_mrs.attr, - &num_reg_pages.attr, - &num_dereg_pages.attr, - &num_free_callbacks.attr, - NULL, -}; -#endif - -#if 0 -static void destroy_peer_sysfs(struct ib_peer_memory_client *ib_peer_client) -{ - kobject_put(ib_peer_client->kobj); - if (!num_registered_peers) - kobject_put(peers_kobj); - - return; -} - -/* This code uses the sysfs which is not supporeted by the FreeBSD. 
- * Will be added in future to the sysctl */ - -static int create_peer_sysfs(struct ib_peer_memory_client *ib_peer_client) -{ - int ret; - - if (!num_registered_peers) { - /* creating under /sys/kernel/mm */ - peers_kobj = kobject_create_and_add("memory_peers", mm_kobj); - if (!peers_kobj) - return -ENOMEM; - } - - ib_peer_client->peer_mem_attr_group.attrs = peer_mem_attrs; - /* Dir alreday was created explicitly to get its kernel object for further usage */ - ib_peer_client->peer_mem_attr_group.name = NULL; - ib_peer_client->kobj = kobject_create_and_add(ib_peer_client->peer_mem->name, - peers_kobj); - - if (!ib_peer_client->kobj) { - ret = -EINVAL; - goto free; - } - - /* Create the files associated with this kobject */ - ret = sysfs_create_group(ib_peer_client->kobj, - &ib_peer_client->peer_mem_attr_group); - if (ret) - goto peer_free; - - return 0; - -peer_free: - kobject_put(ib_peer_client->kobj); - -free: - if (!num_registered_peers) - kobject_put(peers_kobj); - - return ret; -} -#endif - -static int ib_invalidate_peer_memory(void *reg_handle, - void *core_context) -{ - struct ib_peer_memory_client *ib_peer_client = - (struct ib_peer_memory_client *)reg_handle; - struct invalidation_ctx *invalidation_ctx; - struct core_ticket *core_ticket; - int need_unlock = 1; - - mutex_lock(&ib_peer_client->lock); - ib_peer_client->stats.num_free_callbacks += 1; - core_ticket = ib_peer_search_context(ib_peer_client, - (unsigned long)core_context); - if (!core_ticket) - goto out; - - invalidation_ctx = (struct invalidation_ctx *)core_ticket->context; - /* If context not ready yet mark to be invalidated */ - if (!invalidation_ctx->func) { - invalidation_ctx->peer_invalidated = 1; - goto out; - } - - invalidation_ctx->func(invalidation_ctx->cookie, - invalidation_ctx->umem, 0, 0); - if (invalidation_ctx->inflight_invalidation) { - - /* init the completion to wait on before letting other thread to run */ - init_completion(&invalidation_ctx->comp); - mutex_unlock(&ib_peer_client->lock); - need_unlock = 0; - wait_for_completion(&invalidation_ctx->comp); - } - - kfree(invalidation_ctx); - -out: - if (need_unlock) - mutex_unlock(&ib_peer_client->lock); - - return 0; -} - -/* access to that peer client is under its lock - no extra lock is needed */ -unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, - void *context) -{ - struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL); - - ib_peer_client->last_ticket++; - core_ticket->context = context; - core_ticket->key = ib_peer_client->last_ticket; - - list_add_tail(&core_ticket->ticket_list, - &ib_peer_client->core_ticket_list); - - return core_ticket->key; -} - -int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key) -{ - struct core_ticket *core_ticket, *tmp; - - list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list, - ticket_list) { - if (core_ticket->key == key) { - list_del(&core_ticket->ticket_list); - kfree(core_ticket); - return 0; - } - } - - return 1; -} - -struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key) -{ - struct core_ticket *core_ticket, *tmp; - list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list, - ticket_list) { - if (core_ticket->key == key) - return core_ticket; - } - - return NULL; -} - - -static int ib_memory_peer_check_mandatory(struct peer_memory_client - *peer_client) -{ -#define PEER_MEM_MANDATORY_FUNC(x) {\ - offsetof(struct 
peer_memory_client, x), #x } - - static const struct { - size_t offset; - char *name; - } mandatory_table[] = { - PEER_MEM_MANDATORY_FUNC(acquire), - PEER_MEM_MANDATORY_FUNC(get_pages), - PEER_MEM_MANDATORY_FUNC(put_pages), - PEER_MEM_MANDATORY_FUNC(get_page_size), - PEER_MEM_MANDATORY_FUNC(dma_map), - PEER_MEM_MANDATORY_FUNC(dma_unmap) - }; - int i; - - for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { - if (!*(void **) ((void *) peer_client + mandatory_table[i].offset)) { - printk(KERN_WARNING "Peer memory %s is missing mandatory function %s\n", - peer_client->name, mandatory_table[i].name); - return -EINVAL; - } - } - - return 0; -} - - - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback) -{ - int ret = 0; - struct ib_peer_memory_client *ib_peer_client = NULL; - - mutex_lock(&peer_memory_mutex); - if (ib_memory_peer_check_mandatory(peer_client)) { - ret = -EINVAL; - goto out; - } - - ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL); - if (!ib_peer_client) - goto out; - ib_peer_client->peer_mem = peer_client; - - INIT_LIST_HEAD(&ib_peer_client->core_ticket_list); - mutex_init(&ib_peer_client->lock); -#ifdef __FreeBSD__ - ib_peer_client->holdcount = 0; - ib_peer_client->needwakeup = 0; - cv_init(&ib_peer_client->peer_cv, "ibprcl"); -#else - ret = init_srcu_struct(&ib_peer_client->peer_srcu); - if (ret) - goto free; -#endif -#if 0 - if (create_peer_sysfs(ib_peer_client)) - goto free; -#endif - *invalidate_callback = ib_invalidate_peer_memory; - list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list); - num_registered_peers++; - goto out; -#if 0 -free: - kfree(ib_peer_client); - ib_peer_client = NULL; -#endif -out: - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; -} -EXPORT_SYMBOL(ib_register_peer_memory_client); - -void ib_unregister_peer_memory_client(void *reg_handle) -{ - struct ib_peer_memory_client *ib_peer_client = - (struct ib_peer_memory_client *)reg_handle; - - mutex_lock(&peer_memory_mutex); - /* remove from list to prevent future core clients usage as it goes down */ - list_del(&ib_peer_client->core_peer_list); -#ifdef __FreeBSD__ - while (ib_peer_client->holdcount != 0) { - ib_peer_client->needwakeup = 1; - cv_wait(&ib_peer_client->peer_cv, &peer_memory_mutex.sx); - } - cv_destroy(&ib_peer_client->peer_cv); -#else - mutex_unlock(&peer_memory_mutex); - /* peer memory can't go down while there are active clients */ - synchronize_srcu(&ib_peer_client->peer_srcu); - cleanup_srcu_struct(&ib_peer_client->peer_srcu); - mutex_lock(&peer_memory_mutex); -#endif - num_registered_peers--; -/* This code uses the sysfs which is not supporeted by the FreeBSD. - * Will be added in future to the sysctl */ -#if 0 - destroy_peer_sysfs(ib_peer_client); -#endif - mutex_unlock(&peer_memory_mutex); - - kfree(ib_peer_client); -} -EXPORT_SYMBOL(ib_unregister_peer_memory_client); - -/* This code uses the sysfs which is not supporeted by the FreeBSD. 
- * Will be added in future to the sysctl */ - -#if 0 -static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj) -{ - struct ib_peer_memory_client *ib_peer_client; - - mutex_lock(&peer_memory_mutex); - list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { - if (ib_peer_client->kobj == kobj) - goto found; - } - - ib_peer_client = NULL; - -found: - - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; -} -#endif - -struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, - size_t size, void **peer_client_context, - int *srcu_key) -{ - struct ib_peer_memory_client *ib_peer_client; - int ret; - - mutex_lock(&peer_memory_mutex); - list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) { - ret = ib_peer_client->peer_mem->acquire(addr, size, - context->peer_mem_private_data, - context->peer_mem_name, - peer_client_context); - if (ret == 1) - goto found; - } - - ib_peer_client = NULL; - -found: - if (ib_peer_client) { -#ifdef __FreeBSD__ - ib_peer_client->holdcount++; -#else - *srcu_key = srcu_read_lock(&ib_peer_client->peer_srcu); -#endif - } - - mutex_unlock(&peer_memory_mutex); - return ib_peer_client; - -} -EXPORT_SYMBOL(ib_get_peer_client); - -void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, - void *peer_client_context, - int srcu_key) -{ - - if (ib_peer_client->peer_mem->release) - ib_peer_client->peer_mem->release(peer_client_context); - -#ifdef __FreeBSD__ - ib_peer_client->holdcount--; - if (ib_peer_client->holdcount == 0 && ib_peer_client->needwakeup) { - cv_signal(&ib_peer_client->peer_cv); - } -#else - srcu_read_unlock(&ib_peer_client->peer_srcu, srcu_key); -#endif - return; -} -EXPORT_SYMBOL(ib_put_peer_client); - Index: sys/ofed/drivers/infiniband/core/smi.h =================================================================== --- sys/ofed/drivers/infiniband/core/smi.h +++ sys/ofed/drivers/infiniband/core/smi.h @@ -51,12 +51,12 @@ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ }; -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, int port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num); + bool is_switch, int port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM Index: sys/ofed/drivers/infiniband/core/smi.c =================================================================== --- sys/ofed/drivers/infiniband/core/smi.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. - * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include "smi.h" - -/* - * Fixup a directed route SMP for sending - * Return 0 if the SMP should be discarded - */ -enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - u8 node_type, int port_num) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:1 */ - if (hop_cnt && hop_ptr == 0) { - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:2 */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - /* smp->return_path set when received */ - smp->hop_ptr++; - return (smp->initial_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { - /* smp->return_path set when received */ - smp->hop_ptr++; - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - /* C14-9:5 -- Fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - - } else { - /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:3 -- at the end of the DR segment of path */ - if (hop_ptr == 1) { - smp->hop_ptr--; - /* C14-13:3 -- SMPs destined for SM shouldn't be here */ - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_slid == IB_LID_PERMISSIVE ? 
- IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ - if (hop_ptr == 0) - return IB_SMI_HANDLE; - - /* C14-13:5 -- Check for unreasonable hop pointer */ - return IB_SMI_DISCARD; - } -} - -/* - * Adjust information for a received SMP - * Return 0 if the SMP should be dropped - */ -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, - int port_num, int phys_port_cnt) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:1 -- sender should have incremented hop_ptr */ - if (hop_cnt && hop_ptr == 0) - return IB_SMI_DISCARD; - - /* C14-9:2 -- intermediate hop */ - if (hop_ptr && hop_ptr < hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ - return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) { - if (hop_cnt) - smp->return_path[hop_ptr] = port_num; - /* smp->hop_ptr updated when sending */ - - return (node_type == RDMA_NODE_IB_SWITCH || - smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - /* C14-9:5 -- fail unreasonable hop pointer */ - return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - - } else { - - /* C14-13:1 */ - if (hop_cnt && hop_ptr == hop_cnt + 1) { - smp->hop_ptr--; - return (smp->return_path[smp->hop_ptr] == - port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:2 */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) { - if (node_type != RDMA_NODE_IB_SWITCH) - return IB_SMI_DISCARD; - - /* smp->hop_ptr updated when sending */ - return (smp->return_path[hop_ptr-1] <= phys_port_cnt ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:3 -- We're at the end of the DR segment of path */ - if (hop_ptr == 1) { - if (smp->dr_slid == IB_LID_PERMISSIVE) { - /* giving SMP to SM - update hop_ptr */ - smp->hop_ptr--; - return IB_SMI_HANDLE; - } - /* smp->hop_ptr updated when sending */ - return (node_type == RDMA_NODE_IB_SWITCH ? - IB_SMI_HANDLE : IB_SMI_DISCARD); - } - - /* C14-13:4 -- hop_ptr = 0 -> give to SM */ - /* C14-13:5 -- Check for unreasonable hop pointer */ - return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD); - } -} - -enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) -{ - u8 hop_ptr, hop_cnt; - - hop_ptr = smp->hop_ptr; - hop_cnt = smp->hop_cnt; - - if (!ib_get_smp_direction(smp)) { - /* C14-9:2 -- intermediate hop */ - if (hop_ptr && hop_ptr < hop_cnt) - return IB_SMI_FORWARD; - - /* C14-9:3 -- at the end of the DR segment of path */ - if (hop_ptr == hop_cnt) - return (smp->dr_dlid == IB_LID_PERMISSIVE ? - IB_SMI_SEND : IB_SMI_LOCAL); - - /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ - if (hop_ptr == hop_cnt + 1) - return IB_SMI_SEND; - } else { - /* C14-13:2 -- intermediate hop */ - if (2 <= hop_ptr && hop_ptr <= hop_cnt) - return IB_SMI_FORWARD; - - /* C14-13:3 -- at the end of the DR segment of path */ - if (hop_ptr == 1) - return (smp->dr_slid != IB_LID_PERMISSIVE ? 
- IB_SMI_SEND : IB_SMI_LOCAL); - } - return IB_SMI_LOCAL; -} - -/* - * Return the forwarding port number from initial_path for outgoing SMP and - * from return_path for returning SMP - */ -int smi_get_fwd_port(struct ib_smp *smp) -{ - return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : - smp->return_path[smp->hop_ptr-1]); -} Index: sys/ofed/drivers/infiniband/core/sysfs.c =================================================================== --- sys/ofed/drivers/infiniband/core/sysfs.c +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "core_priv.h" - -#include -#include -#include -#include - -#include -#include - -struct ib_port { - struct kobject kobj; - struct ib_device *ibdev; - struct attribute_group gid_group; - struct attribute_group pkey_group; - u8 port_num; -}; - -struct port_attribute { - struct attribute attr; - ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); - ssize_t (*store)(struct ib_port *, struct port_attribute *, - const char *buf, size_t count); -}; - -#define PORT_ATTR(_name, _mode, _show, _store) \ -struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) - -#define PORT_ATTR_RO(_name) \ -struct port_attribute port_attr_##_name = __ATTR_RO(_name) - -struct port_table_attribute { - struct port_attribute attr; - char name[8]; - int index; -}; - -static ssize_t port_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - if (!port_attr->show) - return -EIO; - - return port_attr->show(p, port_attr, buf); -} - -static const struct sysfs_ops port_sysfs_ops = { - .show = port_attr_show -}; - -static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - static const char *state_name[] = { - [IB_PORT_NOP] = "NOP", - [IB_PORT_DOWN] = "DOWN", - [IB_PORT_INIT] = "INIT", - [IB_PORT_ARMED] = "ARMED", - [IB_PORT_ACTIVE] = "ACTIVE", - [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" - }; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d: %s\n", attr.state, - attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? - state_name[attr.state] : "UNKNOWN"); -} - -static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%x\n", attr.lid); -} - -static ssize_t lid_mask_count_show(struct ib_port *p, - struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d\n", attr.lmc); -} - -static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%x\n", attr.sm_lid); -} - -static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "%d\n", attr.sm_sl); -} - -static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - return sprintf(buf, "0x%08x\n", attr.port_cap_flags); -} - -static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - char *speed = ""; - int rate; /* in deci-Gb/sec */ - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - ib_active_speed_enum_to_rate(attr.active_speed, - &rate, - &speed); - - rate *= 
ib_width_enum_to_int(attr.active_width); - if (rate < 0) - return -EINVAL; - - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); -} - -static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - struct ib_port_attr attr; - - ssize_t ret; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); - if (ret) - return ret; - - switch (attr.phys_state) { - case 1: return sprintf(buf, "1: Sleep\n"); - case 2: return sprintf(buf, "2: Polling\n"); - case 3: return sprintf(buf, "3: Disabled\n"); - case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); - case 5: return sprintf(buf, "5: LinkUp\n"); - case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); - case 7: return sprintf(buf, "7: Phy Test\n"); - default: return sprintf(buf, "%d: \n", attr.phys_state); - } -} - -static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, - char *buf) -{ - switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { - case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "InfiniBand"); - case IB_LINK_LAYER_ETHERNET: - return sprintf(buf, "%s\n", "Ethernet"); - case IB_LINK_LAYER_SCIF: - return sprintf(buf, "%s\n", "SCIF"); - default: - return sprintf(buf, "%s\n", "Unknown"); - } -} - -static PORT_ATTR_RO(state); -static PORT_ATTR_RO(lid); -static PORT_ATTR_RO(lid_mask_count); -static PORT_ATTR_RO(sm_lid); -static PORT_ATTR_RO(sm_sl); -static PORT_ATTR_RO(cap_mask); -static PORT_ATTR_RO(rate); -static PORT_ATTR_RO(phys_state); -static PORT_ATTR_RO(link_layer); - -static struct attribute *port_default_attrs[] = { - &port_attr_state.attr, - &port_attr_lid.attr, - &port_attr_lid_mask_count.attr, - &port_attr_sm_lid.attr, - &port_attr_sm_sl.attr, - &port_attr_cap_mask.attr, - &port_attr_rate.attr, - &port_attr_phys_state.attr, - &port_attr_link_layer.attr, - NULL -}; - -static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - union ib_gid gid; - ssize_t ret; - - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); - if (ret) - return ret; - - return sprintf(buf, GID_PRINT_FMT"\n",GID_PRINT_ARGS(gid.raw)); -} - -static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - u16 pkey; - ssize_t ret; - - ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); - if (ret) - return ret; - - return sprintf(buf, "0x%04x\n", pkey); -} - -static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, - char *buf, int c_ext) -{ - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; - ssize_t ret; - - if (!p->ibdev->process_mad) - return -ENXIO; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) { - ret = -ENOMEM; - goto out; - } - - in_mad->mad_hdr.base_version = 1; - in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; - in_mad->mad_hdr.class_version = 1; - in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - if (c_ext) - in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; - else - in_mad->mad_hdr.attr_id = 
IB_PMA_PORT_COUNTERS; - - in_mad->data[41] = p->port_num; /* PortSelect field */ - - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, in_mad, out_mad) & - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { - ret = -EINVAL; - goto out; - } - - switch (width) { - case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> - (4 - (offset % 8))) & 0xf); - break; - case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); - break; - case 16: - ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); - break; - case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); - break; - case 64: - ret = sprintf(buf, "%llu\n", - (unsigned long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); - break; - default: - ret = 0; - } - -out: - kfree(in_mad); - kfree(out_mad); - - return ret; -} - -#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ -} - -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) -{ - return get_pma_counters(p, attr, buf, 0); -} - -static PORT_PMA_ATTR(symbol_error , 0, 16, 32); -static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); -static PORT_PMA_ATTR(link_downed , 2, 8, 56); -static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); -static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); -static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); -static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); -static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); -static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); -static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); -static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); -static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); -static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); -static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); -static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); -static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); - -static struct attribute *pma_attrs[] = { - &port_pma_attr_symbol_error.attr.attr, - &port_pma_attr_link_error_recovery.attr.attr, - &port_pma_attr_link_downed.attr.attr, - &port_pma_attr_port_rcv_errors.attr.attr, - &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, - &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, - &port_pma_attr_port_xmit_discards.attr.attr, - &port_pma_attr_port_xmit_constraint_errors.attr.attr, - &port_pma_attr_port_rcv_constraint_errors.attr.attr, - &port_pma_attr_local_link_integrity_errors.attr.attr, - &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, - &port_pma_attr_VL15_dropped.attr.attr, - &port_pma_attr_port_xmit_data.attr.attr, - &port_pma_attr_port_rcv_data.attr.attr, - &port_pma_attr_port_xmit_packets.attr.attr, - &port_pma_attr_port_rcv_packets.attr.attr, - NULL -}; - -static struct attribute_group pma_group = { - .name = "counters", - .attrs = pma_attrs -}; - -#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_ext_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ -} - -static ssize_t show_pma_counter_ext(struct ib_port *p, - struct 
port_attribute *attr, char *buf) -{ - return get_pma_counters(p, attr, buf, 1); -} - -static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); -static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); -static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); -static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); -static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); -static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); -static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); -static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); - -static struct attribute *pma_attrs_ext[] = { - &port_pma_attr_ext_port_xmit_data_64.attr.attr, - &port_pma_attr_ext_port_rcv_data_64.attr.attr, - &port_pma_attr_ext_port_xmit_packets_64.attr.attr, - &port_pma_attr_ext_port_rcv_packets_64.attr.attr, - &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, - &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, - &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, - &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, - NULL -}; - -static struct attribute_group pma_ext_group = { - .name = "counters_ext", - .attrs = pma_attrs_ext -}; - -static void ib_port_release(struct kobject *kobj) -{ - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - struct attribute *a; - int i; - - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); - - kfree(p->gid_group.attrs); - - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); - - kfree(p->pkey_group.attrs); - - kfree(p); -} - -static struct kobj_type port_type = { - .release = ib_port_release, - .sysfs_ops = &port_sysfs_ops, - .default_attrs = port_default_attrs -}; - -static void ib_device_release(struct device *device) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - kfree(dev); -} - -#ifdef __linux__ -/* BSD supports this through devfs(5) and devd(8). */ -static int ib_device_uevent(struct device *device, - struct kobj_uevent_env *env) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) - return -ENOMEM; - - /* - * It would be nice to pass the node GUID with the event... 
- */ - - return 0; -} -#endif - -static struct attribute ** -alloc_group_attrs(ssize_t (*show)(struct ib_port *, - struct port_attribute *, char *buf), - int len) -{ - struct attribute **tab_attr; - struct port_table_attribute *element; - int i; - - tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); - if (!tab_attr) - return NULL; - - for (i = 0; i < len; i++) { - element = kzalloc(sizeof(struct port_table_attribute), - GFP_KERNEL); - if (!element) - goto err; - - if (snprintf(element->name, sizeof(element->name), - "%d", i) >= sizeof(element->name)) { - kfree(element); - goto err; - } - - element->attr.attr.name = element->name; - element->attr.attr.mode = S_IRUGO; - element->attr.show = show; - element->index = i; - sysfs_attr_init(&element->attr.attr); - - tab_attr[i] = &element->attr.attr; - } - - return tab_attr; - -err: - while (--i >= 0) - kfree(tab_attr[i]); - kfree(tab_attr); - return NULL; -} - -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) -{ - struct ib_port *p; - struct ib_port_attr attr; - int i; - int ret; - - ret = ib_query_port(device, port_num, &attr); - if (ret) - return ret; - - p = kzalloc(sizeof *p, GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->ibdev = device; - p->port_num = port_num; - - ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, - "%d", port_num); - if (ret) - goto err_put; - - ret = sysfs_create_group(&p->kobj, &pma_group); - if (ret) - goto err_put; - - ret = sysfs_create_group(&p->kobj, &pma_ext_group); - if (ret) - goto err_remove_pma; - - p->gid_group.name = "gids"; - p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) - goto err_remove_pma_ext; - - ret = sysfs_create_group(&p->kobj, &p->gid_group); - if (ret) - goto err_free_gid; - - p->pkey_group.name = "pkeys"; - p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, - attr.pkey_tbl_len); - if (!p->pkey_group.attrs) - goto err_remove_gid; - - ret = sysfs_create_group(&p->kobj, &p->pkey_group); - if (ret) - goto err_free_pkey; - - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); - if (ret) - goto err_remove_pkey; - } - - list_add_tail(&p->kobj.entry, &device->port_list); -#ifdef __linux__ - kobject_uevent(&p->kobj, KOBJ_ADD); -#endif - return 0; - -err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); - -err_free_pkey: - for (i = 0; i < attr.pkey_tbl_len; ++i) - kfree(p->pkey_group.attrs[i]); - - kfree(p->pkey_group.attrs); - -err_remove_gid: - sysfs_remove_group(&p->kobj, &p->gid_group); - -err_free_gid: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_group.attrs[i]); - - kfree(p->gid_group.attrs); - -err_remove_pma_ext: - sysfs_remove_group(&p->kobj, &pma_ext_group); - -err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); - -err_put: - kobject_put(device->ports_parent); - kfree(p); - return ret; -} - -static ssize_t show_node_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - switch (dev->node_type) { - case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); - case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); - case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); - case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); - case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type); - 
default: return sprintf(buf, "%d: \n", dev->node_type); - } -} - -static ssize_t show_sys_image_guid(struct device *device, - struct device_attribute *dev_attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); -} - -static ssize_t show_node_guid(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->node_guid)[0]), - be16_to_cpu(((__be16 *) &dev->node_guid)[1]), - be16_to_cpu(((__be16 *) &dev->node_guid)[2]), - be16_to_cpu(((__be16 *) &dev->node_guid)[3])); -} - -static ssize_t show_node_desc(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%.64s\n", dev->node_desc); -} - -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_modify desc = {}; - int ret; - - if (!dev->modify_device) - return -EIO; - - memcpy(desc.node_desc, buf, min_t(int, count, 64)); - ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); - if (ret) - return ret; - - return count; -} - -static ssize_t show_cmd_perf(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_perf); -} - -static ssize_t set_cmd_perf(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - u32 val; - - if (sscanf(buf, "0x%x", &val) != 1) - return -EINVAL; - - dev->cmd_perf = val; - - return count; -} - -static ssize_t show_cmd_avg(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg); -} - -static ssize_t set_cmd_avg(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - spin_lock(&dev->cmd_perf_lock); - dev->cmd_avg = 0; - dev->cmd_n = 0; - spin_unlock(&dev->cmd_perf_lock); - - return count; -} - -static ssize_t show_cmd_n(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_n); -} - -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf); -static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg); -static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL); - -static struct 
device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_cmd_perf, - &dev_attr_cmd_avg, - &dev_attr_cmd_n, -}; - -static struct class ib_class = { - .name = "infiniband", - .dev_release = ib_device_release, -#ifdef __linux__ - .dev_uevent = ib_device_uevent, -#endif -}; - -/* Show a given an attribute in the statistics group */ -static ssize_t show_protocol_stat(const struct device *device, - struct device_attribute *attr, char *buf, - unsigned offset) -{ - struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev); - union rdma_protocol_stats stats; - ssize_t ret; - - ret = dev->get_protocol_stats(dev, &stats); - if (ret) - return ret; - - return sprintf(buf, "%llu\n", - (unsigned long long) ((u64 *) &stats)[offset]); -} - -/* generate a read-only iwarp statistics attribute */ -#define IW_STATS_ENTRY(name) \ -static ssize_t show_##name(struct device *device, \ - struct device_attribute *attr, char *buf) \ -{ \ - return show_protocol_stat(device, attr, buf, \ - offsetof(struct iw_protocol_stats, name) / \ - sizeof (u64)); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) - -IW_STATS_ENTRY(ipInReceives); -IW_STATS_ENTRY(ipInHdrErrors); -IW_STATS_ENTRY(ipInTooBigErrors); -IW_STATS_ENTRY(ipInNoRoutes); -IW_STATS_ENTRY(ipInAddrErrors); -IW_STATS_ENTRY(ipInUnknownProtos); -IW_STATS_ENTRY(ipInTruncatedPkts); -IW_STATS_ENTRY(ipInDiscards); -IW_STATS_ENTRY(ipInDelivers); -IW_STATS_ENTRY(ipOutForwDatagrams); -IW_STATS_ENTRY(ipOutRequests); -IW_STATS_ENTRY(ipOutDiscards); -IW_STATS_ENTRY(ipOutNoRoutes); -IW_STATS_ENTRY(ipReasmTimeout); -IW_STATS_ENTRY(ipReasmReqds); -IW_STATS_ENTRY(ipReasmOKs); -IW_STATS_ENTRY(ipReasmFails); -IW_STATS_ENTRY(ipFragOKs); -IW_STATS_ENTRY(ipFragFails); -IW_STATS_ENTRY(ipFragCreates); -IW_STATS_ENTRY(ipInMcastPkts); -IW_STATS_ENTRY(ipOutMcastPkts); -IW_STATS_ENTRY(ipInBcastPkts); -IW_STATS_ENTRY(ipOutBcastPkts); -IW_STATS_ENTRY(tcpRtoAlgorithm); -IW_STATS_ENTRY(tcpRtoMin); -IW_STATS_ENTRY(tcpRtoMax); -IW_STATS_ENTRY(tcpMaxConn); -IW_STATS_ENTRY(tcpActiveOpens); -IW_STATS_ENTRY(tcpPassiveOpens); -IW_STATS_ENTRY(tcpAttemptFails); -IW_STATS_ENTRY(tcpEstabResets); -IW_STATS_ENTRY(tcpCurrEstab); -IW_STATS_ENTRY(tcpInSegs); -IW_STATS_ENTRY(tcpOutSegs); -IW_STATS_ENTRY(tcpRetransSegs); -IW_STATS_ENTRY(tcpInErrs); -IW_STATS_ENTRY(tcpOutRsts); - -static struct attribute *iw_proto_stats_attrs[] = { - &dev_attr_ipInReceives.attr, - &dev_attr_ipInHdrErrors.attr, - &dev_attr_ipInTooBigErrors.attr, - &dev_attr_ipInNoRoutes.attr, - &dev_attr_ipInAddrErrors.attr, - &dev_attr_ipInUnknownProtos.attr, - &dev_attr_ipInTruncatedPkts.attr, - &dev_attr_ipInDiscards.attr, - &dev_attr_ipInDelivers.attr, - &dev_attr_ipOutForwDatagrams.attr, - &dev_attr_ipOutRequests.attr, - &dev_attr_ipOutDiscards.attr, - &dev_attr_ipOutNoRoutes.attr, - &dev_attr_ipReasmTimeout.attr, - &dev_attr_ipReasmReqds.attr, - &dev_attr_ipReasmOKs.attr, - &dev_attr_ipReasmFails.attr, - &dev_attr_ipFragOKs.attr, - &dev_attr_ipFragFails.attr, - &dev_attr_ipFragCreates.attr, - &dev_attr_ipInMcastPkts.attr, - &dev_attr_ipOutMcastPkts.attr, - &dev_attr_ipInBcastPkts.attr, - &dev_attr_ipOutBcastPkts.attr, - &dev_attr_tcpRtoAlgorithm.attr, - &dev_attr_tcpRtoMin.attr, - &dev_attr_tcpRtoMax.attr, - &dev_attr_tcpMaxConn.attr, - &dev_attr_tcpActiveOpens.attr, - &dev_attr_tcpPassiveOpens.attr, - &dev_attr_tcpAttemptFails.attr, - &dev_attr_tcpEstabResets.attr, - &dev_attr_tcpCurrEstab.attr, - 
&dev_attr_tcpInSegs.attr, - &dev_attr_tcpOutSegs.attr, - &dev_attr_tcpRetransSegs.attr, - &dev_attr_tcpInErrs.attr, - &dev_attr_tcpOutRsts.attr, - NULL -}; - -static struct attribute_group iw_stats_group = { - .name = "proto_stats", - .attrs = iw_proto_stats_attrs, -}; - -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) -{ - struct device *class_dev = &device->dev; - int ret; - int i; - - class_dev->class = &ib_class; - class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); - dev_set_drvdata(class_dev, device); - - INIT_LIST_HEAD(&device->port_list); - - ret = device_register(class_dev); - if (ret) - goto err; - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - - device->ports_parent = kobject_create_and_add("ports",&class_dev->kobj); - if (!device->ports_parent) { - ret = -ENOMEM; - goto err_put; - } - - if (device->node_type == RDMA_NODE_IB_SWITCH) { - ret = add_port(device, 0, port_callback); - if (ret) - goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); - if (ret) - goto err_put; - } - } - - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { - ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); - if (ret) - goto err_put; - } - - return 0; - -err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - } - - kobject_put(&class_dev->kobj); - -err_unregister: - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(class_dev, ib_class_attributes[i]); - } - - device_unregister(class_dev); - -err: - return ret; -} - -void ib_device_unregister_sysfs(struct ib_device *device) -{ - int i; - struct kobject *p, *t; - struct ib_port *port; - struct device *class_dev = &device->dev; - - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); - - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(class_dev, ib_class_attributes[i]); - } - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } - - kobject_put(device->ports_parent); - device_unregister(&device->dev); -} - -int ib_sysfs_setup(void) -{ - return class_register(&ib_class); -} - -void ib_sysfs_cleanup(void) -{ - class_unregister(&ib_class); -} Index: sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- sys/ofed/drivers/infiniband/core/umem.c +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Cisco Systems. All rights reserved. - * Copyright (c) 2005 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#define LINUXKPI_PARAM_PREFIX ibcore_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "uverbs.h" - -#define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) - -static int allow_weak_ordering; -module_param_named(weak_ordering, allow_weak_ordering, int, 0444); -MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); - -static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, - struct ib_umem *umem, unsigned long addr, - int dmasync, int invalidation_supported) -{ - int ret; - const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; - struct invalidation_ctx *invalidation_ctx = NULL; - - umem->ib_peer_mem = ib_peer_mem; - if (invalidation_supported) { - invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); - if (!invalidation_ctx) { - ret = -ENOMEM; - goto out; - } - umem->invalidation_ctx = invalidation_ctx; - invalidation_ctx->umem = umem; - mutex_lock(&ib_peer_mem->lock); - invalidation_ctx->context_ticket = - ib_peer_insert_context(ib_peer_mem, invalidation_ctx); - /* unlock before calling get pages to prevent a dead-lock from the callback */ - mutex_unlock(&ib_peer_mem->lock); - } - - ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, - &umem->sg_head, - umem->peer_mem_client_context, - invalidation_ctx ? 
- (void *)invalidation_ctx->context_ticket : NULL); - - if (invalidation_ctx) { - /* taking the lock back, checking that wasn't invalidated at that time */ - mutex_lock(&ib_peer_mem->lock); - if (invalidation_ctx->peer_invalidated) { - printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n"); - ret = -EINVAL; - } - } - - if (ret) - goto out; - - umem->page_size = peer_mem->get_page_size - (umem->peer_mem_client_context); - if (umem->page_size <= 0) - goto put_pages; - - umem->offset = addr & ((unsigned long)umem->page_size - 1); - ret = peer_mem->dma_map(&umem->sg_head, - umem->peer_mem_client_context, - umem->context->device->dma_device, - dmasync, - &umem->nmap); - if (ret) - goto put_pages; - - ib_peer_mem->stats.num_reg_pages += - umem->nmap * (umem->page_size >> PAGE_SHIFT); - ib_peer_mem->stats.num_alloc_mrs += 1; - return umem; - -put_pages: - - peer_mem->put_pages(umem->peer_mem_client_context, - &umem->sg_head); -out: - if (invalidation_ctx) { - ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); - mutex_unlock(&umem->ib_peer_mem->lock); - kfree(invalidation_ctx); - } - - ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, - umem->peer_mem_srcu_key); - kfree(umem); - return ERR_PTR(ret); -} - -static void peer_umem_release(struct ib_umem *umem) -{ - struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; - const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem; - struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; - - if (invalidation_ctx) { - - int peer_callback; - int inflight_invalidation; - /* If we are not under peer callback we must take the lock before removing - * core ticket from the tree and releasing its umem. - * It will let any inflight callbacks to be ended safely. - * If we are under peer callback or under error flow of reg_mr so that context - * wasn't activated yet lock was already taken. - */ - if (invalidation_ctx->func && !invalidation_ctx->peer_callback) - mutex_lock(&ib_peer_mem->lock); - ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); - /* make sure to check inflight flag after took the lock and remove from tree. - * in addition, from that point using local variables for peer_callback and - * inflight_invalidation as after the complete invalidation_ctx can't be accessed - * any more as it may be freed by the callback. 
- */ - peer_callback = invalidation_ctx->peer_callback; - inflight_invalidation = invalidation_ctx->inflight_invalidation; - if (inflight_invalidation) - complete(&invalidation_ctx->comp); - /* On peer callback lock is handled externally */ - if (!peer_callback) - /* unlocking before put_pages */ - mutex_unlock(&ib_peer_mem->lock); - /* in case under callback context or callback is pending let it free the invalidation context */ - if (!peer_callback && !inflight_invalidation) - kfree(invalidation_ctx); - } - - peer_mem->dma_unmap(&umem->sg_head, - umem->peer_mem_client_context, - umem->context->device->dma_device); - peer_mem->put_pages(&umem->sg_head, - umem->peer_mem_client_context); - - ib_peer_mem->stats.num_dereg_pages += - umem->nmap * (umem->page_size >> PAGE_SHIFT); - ib_peer_mem->stats.num_dealloc_mrs += 1; - ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, - umem->peer_mem_srcu_key); - kfree(umem); - - return; - -} - -static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) -{ - - vm_object_t object; - struct scatterlist *sg; - struct page *page; - int i; - - object = NULL; - if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->nmap, - DMA_BIDIRECTIONAL); - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - page = sg_page(sg); - if (umem->writable && dirty) { - if (object && object != page->object) - VM_OBJECT_WUNLOCK(object); - if (object != page->object) { - object = page->object; - VM_OBJECT_WLOCK(object); - } - vm_page_dirty(page); - } - } - sg_free_table(&umem->sg_head); - if (object) - VM_OBJECT_WUNLOCK(object); - -} - -void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, - umem_invalidate_func_t func, - void *cookie) -{ - struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx; - - invalidation_ctx->func = func; - invalidation_ctx->cookie = cookie; - - /* from that point any pending invalidations can be called */ - mutex_unlock(&umem->ib_peer_mem->lock); - return; -} -EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); -/** - * ib_umem_get - Pin and DMA map userspace memory. - * @context: userspace context to pin memory for - * @addr: userspace virtual address to start at - * @size: length of region to pin - * @access: IB_ACCESS_xxx flags for memory being pinned - * @dmasync: flush in-flight DMA when the memory region is written - */ -struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync, - int invalidation_supported) -{ - - struct ib_umem *umem; - struct proc *proc; - pmap_t pmap; - vm_offset_t end, last, start; - vm_size_t npages; - int error; - int ret; - int ents; - int i; - DEFINE_DMA_ATTRS(attrs); - struct scatterlist *sg, *sg_list_start; - int need_release = 0; - - error = priv_check(curthread, PRIV_VM_MLOCK); - if (error) - return ERR_PTR(-error); - - last = addr + size; - start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ - end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
*/ - if (last < addr || end < addr) - return ERR_PTR(-EINVAL); - npages = atop(end - start); - if (npages > vm_page_max_wired) - return ERR_PTR(-ENOMEM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - proc = curthread->td_proc; - PROC_LOCK(proc); - if (ptoa(npages + - pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > - lim_cur_proc(proc, RLIMIT_MEMLOCK)) { - PROC_UNLOCK(proc); - kfree(umem); - return ERR_PTR(-ENOMEM); - } - PROC_UNLOCK(proc); - if (npages + vm_cnt.v_wire_count > vm_page_max_wired) { - kfree(umem); - return ERR_PTR(-EAGAIN); - } - error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | - (umem->writable ? VM_MAP_WIRE_WRITE : 0)); - if (error != KERN_SUCCESS) { - kfree(umem); - return ERR_PTR(-ENOMEM); - } - - umem->context = context; - umem->length = size; - umem->offset = addr & ~PAGE_MASK; - umem->page_size = PAGE_SIZE; - umem->start = addr; - /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. - */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); - - if (invalidation_supported || context->peer_mem_private_data) { - - struct ib_peer_memory_client *peer_mem_client; - - peer_mem_client = ib_get_peer_client(context, addr, size, - &umem->peer_mem_client_context, - &umem->peer_mem_srcu_key); - if (peer_mem_client) - return peer_umem_get(peer_mem_client, umem, addr, - dmasync, invalidation_supported); - } - - umem->hugetlb = 0; - - pmap = vm_map_pmap(&proc->p_vmspace->vm_map); - - if (npages == 0) { - ret = -EINVAL; - goto out; - } - - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); - if (ret) - goto out; - - need_release = 1; - sg_list_start = umem->sg_head.sgl; - - while (npages) { - - ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK); - umem->npages += ents; - - for_each_sg(sg_list_start, sg, ents, i) { - vm_paddr_t pa; - - pa = pmap_extract(pmap, start); - if (pa == 0) { - ret = -ENOMEM; - goto out; - } - sg_set_page(sg, PHYS_TO_VM_PAGE(pa), - PAGE_SIZE, 0); - npages--; - start += PAGE_SIZE; - } - - /* preparing for next loop */ - sg_list_start = sg; - } - - umem->nmap = ib_dma_map_sg_attrs(context->device, - umem->sg_head.sgl, - umem->npages, - DMA_BIDIRECTIONAL, - &attrs); - if (umem->nmap != umem->npages) { - ret = -ENOMEM; - goto out; - } - -out: - if (ret < 0) { - if (need_release) - __ib_umem_release(context->device, umem, 0); - kfree(umem); - } - - return ret < 0 ? 
ERR_PTR(ret) : umem; -} -EXPORT_SYMBOL(ib_umem_get_ex); - -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync) -{ - return ib_umem_get_ex(context, addr, - size, access, dmasync, 0); -} -EXPORT_SYMBOL(ib_umem_get); - -/** - * ib_umem_release - release memory pinned with ib_umem_get - * @umem: umem struct to release - */ -void ib_umem_release(struct ib_umem *umem) -{ - - vm_offset_t addr, end, last, start; - vm_size_t size; - int error; - - if (umem->ib_peer_mem) { - peer_umem_release(umem); - return; - } - - __ib_umem_release(umem->context->device, umem, 1); - - if (umem->context->closing) { - kfree(umem); - return; - } - - error = priv_check(curthread, PRIV_VM_MUNLOCK); - - if (error) - return; - - addr = umem->start; - size = umem->length; - last = addr + size; - start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ - end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ - vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); - kfree(umem); - -} -EXPORT_SYMBOL(ib_umem_release); - -int ib_umem_page_count(struct ib_umem *umem) -{ - int shift; - int i; - int n; - struct scatterlist *sg; - - shift = ilog2(umem->page_size); - - n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> shift; - - return n; -} -EXPORT_SYMBOL(ib_umem_page_count); Index: sys/ofed/drivers/infiniband/core/uverbs.h =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs.h +++ sys/ofed/drivers/infiniband/core/uverbs.h @@ -42,13 +42,29 @@ #include #include #include +#include +#include #include #include -#include #include #include -#include + +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? 
(void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) /* * Our lifetime rules for these structs are the following: @@ -72,20 +88,24 @@ */ struct ib_uverbs_device { - struct kref ref; + atomic_t refcount; int num_comp_vectors; struct completion comp; struct device *dev; - struct ib_device *ib_dev; + struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct kobject kobj; + struct srcu_struct disassociate_srcu; + struct mutex lists_mutex; /* protect lists */ + struct list_head uverbs_file_list; + struct list_head uverbs_events_file_list; }; struct ib_uverbs_event_file { struct kref ref; - struct file *filp; int is_async; struct ib_uverbs_file *uverbs_file; spinlock_t lock; @@ -93,15 +113,19 @@ wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + struct list_head list; }; struct ib_uverbs_file { struct kref ref; struct mutex mutex; + struct mutex cleanup_mutex; /* protect cleanup */ struct ib_uverbs_device *device; struct ib_ucontext *ucontext; struct ib_event_handler event_handler; struct ib_uverbs_event_file *async_file; + struct list_head list; + int is_closed; }; struct ib_uverbs_event { @@ -142,6 +166,10 @@ struct ib_uxrcd_object *uxrcd; }; +struct ib_uwq_object { + struct ib_uevent_object uevent; +}; + struct ib_ucq_object { struct ib_uobject uobject; struct ib_uverbs_file *uverbs_file; @@ -151,10 +179,6 @@ u32 async_events_reported; }; -struct ib_udct_object { - struct ib_uobject uobject; -}; - extern spinlock_t ib_uverbs_idr_lock; extern struct idr ib_uverbs_pd_idr; extern struct idr ib_uverbs_mr_idr; @@ -165,12 +189,15 @@ extern struct idr ib_uverbs_srq_idr; extern struct idr ib_uverbs_xrcd_idr; extern struct idr ib_uverbs_rule_idr; -extern struct idr ib_uverbs_dct_idr; +extern struct idr ib_uverbs_wq_idr; +extern struct idr ib_uverbs_rwq_ind_tbl_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev, int is_async); +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, @@ -182,11 +209,14 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { @@ -198,14 +228,15 @@ }; }; struct ib_uverbs_flow_spec_eth eth; - struct ib_uverbs_flow_spec_ib ib; struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; }; }; #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ const char __user *buf, int in_len, \ int out_len) @@ -215,6 +246,7 @@ IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); 
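/*
 * Illustrative sketch only, not part of the change above: one plausible way a
 * uverbs command handler fills a struct ib_udata with the INIT_UDATA /
 * INIT_UDATA_BUF_OR_NULL helpers introduced in this hunk.  The helper name
 * and parameters below are assumptions for the example, not code from the tree.
 */
static inline void example_fill_udata(struct ib_udata *udata,
				      const char __user *inbuf, void __user *outbuf,
				      int in_len, int out_len)
{
	/* Describe the command payload (inbuf/in_len) and response (outbuf/out_len). */
	INIT_UDATA(udata, inbuf, outbuf, in_len, out_len);
	/*
	 * INIT_UDATA_BUF_OR_NULL() takes the same arguments but leaves the
	 * corresponding pointer NULL when a length is zero.
	 */
}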
IB_UVERBS_DECLARE_CMD(dereg_mr); IB_UVERBS_DECLARE_CMD(alloc_mw); IB_UVERBS_DECLARE_CMD(dealloc_mw); @@ -245,25 +277,20 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); #define IB_UVERBS_DECLARE_EX_CMD(name) \ - int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\ - struct ib_udata *ucore, \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ + struct ib_udata *ucore, \ struct ib_udata *uhw) -#define IB_UVERBS_DECLARE_EXP_CMD(name) \ - ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \ - struct ib_udata *ucore, \ - struct ib_udata *uhw) - IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); - -IB_UVERBS_DECLARE_EXP_CMD(create_qp); -IB_UVERBS_DECLARE_EXP_CMD(modify_cq); -IB_UVERBS_DECLARE_EXP_CMD(modify_qp); -IB_UVERBS_DECLARE_EXP_CMD(create_cq); -IB_UVERBS_DECLARE_EXP_CMD(query_device); -IB_UVERBS_DECLARE_EXP_CMD(create_dct); -IB_UVERBS_DECLARE_EXP_CMD(destroy_dct); -IB_UVERBS_DECLARE_EXP_CMD(query_dct); +IB_UVERBS_DECLARE_EX_CMD(query_device); +IB_UVERBS_DECLARE_EX_CMD(create_cq); +IB_UVERBS_DECLARE_EX_CMD(create_qp); +IB_UVERBS_DECLARE_EX_CMD(create_wq); +IB_UVERBS_DECLARE_EX_CMD(modify_wq); +IB_UVERBS_DECLARE_EX_CMD(destroy_wq); +IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table); #endif /* UVERBS_H */ Index: sys/ofed/drivers/infiniband/core/verbs.c =================================================================== --- sys/ofed/drivers/infiniband/core/verbs.c +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004 Intel Corporation. All rights reserved. - * Copyright (c) 2004 Topspin Corporation. All rights reserved. - * Copyright (c) 2004 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -int ib_rate_to_mult(enum ib_rate rate) -{ - switch (rate) { - case IB_RATE_2_5_GBPS: return 1; - case IB_RATE_5_GBPS: return 2; - case IB_RATE_10_GBPS: return 4; - case IB_RATE_20_GBPS: return 8; - case IB_RATE_30_GBPS: return 12; - case IB_RATE_40_GBPS: return 16; - case IB_RATE_60_GBPS: return 24; - case IB_RATE_80_GBPS: return 32; - case IB_RATE_120_GBPS: return 48; - default: return -1; - } -} -EXPORT_SYMBOL(ib_rate_to_mult); - -enum ib_rate mult_to_ib_rate(int mult) -{ - switch (mult) { - case 1: return IB_RATE_2_5_GBPS; - case 2: return IB_RATE_5_GBPS; - case 4: return IB_RATE_10_GBPS; - case 8: return IB_RATE_20_GBPS; - case 12: return IB_RATE_30_GBPS; - case 16: return IB_RATE_40_GBPS; - case 24: return IB_RATE_60_GBPS; - case 32: return IB_RATE_80_GBPS; - case 48: return IB_RATE_120_GBPS; - default: return IB_RATE_PORT_CURRENT; - } -} -EXPORT_SYMBOL(mult_to_ib_rate); - -int ib_rate_to_mbps(enum ib_rate rate) -{ - switch (rate) { - case IB_RATE_2_5_GBPS: return 2500; - case IB_RATE_5_GBPS: return 5000; - case IB_RATE_10_GBPS: return 10000; - case IB_RATE_20_GBPS: return 20000; - case IB_RATE_30_GBPS: return 30000; - case IB_RATE_40_GBPS: return 40000; - case IB_RATE_60_GBPS: return 60000; - case IB_RATE_80_GBPS: return 80000; - case IB_RATE_120_GBPS: return 120000; - case IB_RATE_14_GBPS: return 14062; - case IB_RATE_56_GBPS: return 56250; - case IB_RATE_112_GBPS: return 112500; - case IB_RATE_168_GBPS: return 168750; - case IB_RATE_25_GBPS: return 25781; - case IB_RATE_100_GBPS: return 103125; - case IB_RATE_200_GBPS: return 206250; - case IB_RATE_300_GBPS: return 309375; - default: return -1; - } -} -EXPORT_SYMBOL(ib_rate_to_mbps); - -enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) -{ - switch (node_type) { - case RDMA_NODE_IB_CA: - case RDMA_NODE_IB_SWITCH: - case RDMA_NODE_IB_ROUTER: - return RDMA_TRANSPORT_IB; - case RDMA_NODE_RNIC: - return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_MIC: - return RDMA_TRANSPORT_SCIF; - default: - BUG(); - return 0; - } -} -EXPORT_SYMBOL(rdma_node_get_transport); - -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) -{ - if (device->get_link_layer) - return device->get_link_layer(device, port_num); - - switch (rdma_node_get_transport(device->node_type)) { - case RDMA_TRANSPORT_IB: - return IB_LINK_LAYER_INFINIBAND; - case RDMA_TRANSPORT_IWARP: - return IB_LINK_LAYER_ETHERNET; - case RDMA_TRANSPORT_SCIF: - return IB_LINK_LAYER_SCIF; - default: - return IB_LINK_LAYER_UNSPECIFIED; - } -} -EXPORT_SYMBOL(rdma_port_get_link_layer); - -/* Protection domains */ - -struct ib_pd *ib_alloc_pd(struct ib_device *device) -{ - struct ib_pd *pd; - - pd = device->alloc_pd(device, NULL, NULL); - - if (!IS_ERR(pd)) { - pd->device = device; - pd->uobject = NULL; - atomic_set(&pd->usecnt, 0); - } - - return pd; -} -EXPORT_SYMBOL(ib_alloc_pd); - -int ib_dealloc_pd(struct ib_pd *pd) -{ - if (atomic_read(&pd->usecnt)) - return -EBUSY; - - return pd->device->dealloc_pd(pd); -} -EXPORT_SYMBOL(ib_dealloc_pd); - -/* Address handles */ - -struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) -{ - struct ib_ah *ah; - - ah = pd->device->create_ah(pd, ah_attr); - - if (!IS_ERR(ah)) { - ah->device = pd->device; - ah->pd = pd; - ah->uobject = NULL; - atomic_inc(&pd->usecnt); - } - - return ah; -} -EXPORT_SYMBOL(ib_create_ah); - -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc 
*wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr) -{ - u32 flow_class; - u16 gid_index; - int ret; - int is_eth = (rdma_port_get_link_layer(device, port_num) == - IB_LINK_LAYER_ETHERNET); - - memset(ah_attr, 0, sizeof *ah_attr); - if (is_eth) { - if (!(wc->wc_flags & IB_WC_GRH)) - return -EPROTOTYPE; - - if (wc->wc_flags & IB_WC_WITH_SMAC && - wc->wc_flags & IB_WC_WITH_VLAN) { - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); - ah_attr->vlan_id = wc->vlan_id; - } else { - u32 scope_id = rdma_get_ipv6_scope_id(device, port_num); - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, &ah_attr->vlan_id, - scope_id); - if (ret) - return ret; - } - } else { - ah_attr->vlan_id = 0xffff; - } - - - ah_attr->dlid = wc->slid; - ah_attr->sl = wc->sl; - ah_attr->src_path_bits = wc->dlid_path_bits; - ah_attr->port_num = port_num; - - if (wc->wc_flags & IB_WC_GRH) { - ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; - - ret = ib_find_cached_gid(device, &grh->dgid, &port_num, - &gid_index); - if (ret) - return ret; - - ah_attr->grh.sgid_index = (u8) gid_index; - flow_class = be32_to_cpu(grh->version_tclass_flow); - ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = 0xFF; - ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; - } - return 0; -} -EXPORT_SYMBOL(ib_init_ah_from_wc); - -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num) -{ - struct ib_ah_attr ah_attr; - int ret; - - ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); - if (ret) - return ERR_PTR(ret); - - return ib_create_ah(pd, &ah_attr); -} -EXPORT_SYMBOL(ib_create_ah_from_wc); - -int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) -{ - return ah->device->modify_ah ? - ah->device->modify_ah(ah, ah_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_ah); - -int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) -{ - return ah->device->query_ah ? - ah->device->query_ah(ah, ah_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_query_ah); - -int ib_destroy_ah(struct ib_ah *ah) -{ - struct ib_pd *pd; - int ret; - - pd = ah->pd; - ret = ah->device->destroy_ah(ah); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_destroy_ah); - -/* Shared receive queues */ - -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr) -{ - struct ib_srq *srq; - - if (!pd->device->create_srq) - return ERR_PTR(-ENOSYS); - - srq = pd->device->create_srq(pd, srq_init_attr, NULL); - - if (!IS_ERR(srq)) { - srq->device = pd->device; - srq->pd = pd; - srq->uobject = NULL; - srq->event_handler = srq_init_attr->event_handler; - srq->srq_context = srq_init_attr->srq_context; - srq->srq_type = srq_init_attr->srq_type; - if (srq->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; - atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); - } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - } - - return srq; -} -EXPORT_SYMBOL(ib_create_srq); - -int ib_modify_srq(struct ib_srq *srq, - struct ib_srq_attr *srq_attr, - enum ib_srq_attr_mask srq_attr_mask) -{ - return srq->device->modify_srq ? - srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_srq); - -int ib_query_srq(struct ib_srq *srq, - struct ib_srq_attr *srq_attr) -{ - return srq->device->query_srq ? 
- srq->device->query_srq(srq, srq_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_srq); - -int ib_query_values(struct ib_device *device, - int q_values, struct ib_device_values *values) -{ - return device->query_values ? - device->query_values(device, q_values, values) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_values); - -int ib_destroy_srq(struct ib_srq *srq) -{ - struct ib_pd *pd; - enum ib_srq_type srq_type; - struct ib_xrcd *uninitialized_var(xrcd); - struct ib_cq *uninitialized_var(cq); - int ret; - - if (atomic_read(&srq->usecnt)) - return -EBUSY; - - pd = srq->pd; - srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { - xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } - - ret = srq->device->destroy_srq(srq); - if (!ret) { - atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { - atomic_dec(&xrcd->usecnt); - atomic_dec(&cq->usecnt); - } - } - - return ret; -} -EXPORT_SYMBOL(ib_destroy_srq); - -/* Queue pairs */ - -static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) -{ - struct ib_qp *qp = context; - unsigned long flags; - - /* The code below must be synced with deletions of existing qps (ib_close_qp) -- - * because a qp from the list may be closed during the scan, resulting in a kernel Oops. - */ - spin_lock_irqsave(&qp->device->event_handler_lock, flags); - list_for_each_entry(event->element.qp, &qp->open_list, open_list) - if (event->element.qp->event_handler) - event->element.qp->event_handler(event, event->element.qp->qp_context); - spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); -} - -static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) -{ - mutex_lock(&xrcd->tgt_qp_mutex); - list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); - mutex_unlock(&xrcd->tgt_qp_mutex); -} - -static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, - void (*event_handler)(struct ib_event *, void *), - void *qp_context) -{ - struct ib_qp *qp; - unsigned long flags; - - qp = kzalloc(sizeof *qp, GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - - qp->real_qp = real_qp; - atomic_inc(&real_qp->usecnt); - qp->device = real_qp->device; - qp->event_handler = event_handler; - qp->qp_context = qp_context; - qp->qp_num = real_qp->qp_num; - qp->qp_type = real_qp->qp_type; - - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); - list_add(&qp->open_list, &real_qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); - - return qp; -} - -struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, - struct ib_qp_open_attr *qp_open_attr) -{ - struct ib_qp *qp, *real_qp; - - if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) - return ERR_PTR(-EINVAL); - - qp = ERR_PTR(-EINVAL); - mutex_lock(&xrcd->tgt_qp_mutex); - list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { - if (real_qp->qp_num == qp_open_attr->qp_num) { - qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, - qp_open_attr->qp_context); - break; - } - } - mutex_unlock(&xrcd->tgt_qp_mutex); - return qp; -} -EXPORT_SYMBOL(ib_open_qp); - -struct ib_qp *ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr) -{ - struct ib_qp *qp, *real_qp; - struct ib_device *device; - - device = pd ? 
pd->device : qp_init_attr->xrcd->device; - qp = device->create_qp(pd, qp_init_attr, NULL); - - if (!IS_ERR(qp)) { - qp->device = device; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = qp_init_attr->qp_type; - - atomic_set(&qp->usecnt, 0); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - qp->event_handler = __ib_shared_qp_event_handler; - qp->qp_context = qp; - qp->pd = NULL; - qp->send_cq = qp->recv_cq = NULL; - qp->srq = NULL; - qp->xrcd = qp_init_attr->xrcd; - atomic_inc(&qp_init_attr->xrcd->usecnt); - INIT_LIST_HEAD(&qp->open_list); - - real_qp = qp; - qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, - qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->destroy_qp(real_qp); - } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } - - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; - - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); - } - } - - return qp; -} -EXPORT_SYMBOL(ib_create_qp); - -static const struct { - int valid; - enum ib_qp_attr_mask req_param[IB_QPT_MAX]; - enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; -} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_INIT] = { - .valid = 1, - .req_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_QKEY), - [IB_QPT_RAW_PACKET] = IB_QP_PORT, - [IB_QPT_UC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_RC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS | - IB_QP_DC_KEY), - [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - }, - .opt_param = { - [IB_QPT_UD] = IB_QP_GROUP_RSS, - [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS - } - }, - }, - [IB_QPS_INIT] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_INIT] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_RC] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - } - }, - [IB_QPS_RTR] = { - .valid = 1, - .req_param = { - [IB_QPT_UC] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN), - [IB_QPT_RC] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_PATH_MTU | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - 
[IB_QPT_XRC_INI] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN), - [IB_QPT_XRC_TGT] = (IB_QP_AV | - IB_QP_PATH_MTU | - IB_QP_DEST_QPN | - IB_QP_RQ_PSN | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_MIN_RNR_TIMER), - }, - .req_param_add_eth = { - [IB_QPT_RC] = (IB_QP_SMAC), - [IB_QPT_UC] = (IB_QP_SMAC), - [IB_QPT_XRC_INI] = (IB_QP_SMAC), - [IB_QPT_XRC_TGT] = (IB_QP_SMAC) - }, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_RC] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_DC_INI] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_RAW_PACKET] = IB_QP_AV, - }, - .opt_param_add_eth = { - [IB_QPT_RC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_UC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID) - } - } - }, - [IB_QPS_RTR] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .req_param = { - [IB_QPT_UD] = IB_QP_SQ_PSN, - [IB_QPT_UC] = IB_QP_SQ_PSN, - [IB_QPT_RC] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_SQ_PSN | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_DC_INI] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_SQ_PSN | - IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | - IB_QP_SQ_PSN), - [IB_QPT_SMI] = IB_QP_SQ_PSN, - [IB_QPT_GSI] = IB_QP_SQ_PSN, - }, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - } - }, - [IB_QPS_RTS] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS | - IB_QP_ALT_PATH | - IB_QP_PATH_MIG_STATE | - IB_QP_MIN_RNR_TIMER), - [IB_QPT_SMI] 
= (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - }, - [IB_QPS_SQD] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ - [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, - [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY - } - }, - }, - [IB_QPS_SQD] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - }, - [IB_QPS_SQD] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_AV | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_PATH_MIG_STATE), - [IB_QPT_RC] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_INI] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_RETRY_CNT | - IB_QP_RNR_RETRY | - IB_QP_MAX_QP_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_PATH_MIG_STATE), - [IB_QPT_XRC_TGT] = (IB_QP_PORT | - IB_QP_AV | - IB_QP_TIMEOUT | - IB_QP_MAX_DEST_RD_ATOMIC | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), - [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | - IB_QP_QKEY), - } - } - }, - [IB_QPS_SQE] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, - [IB_QPS_RTS] = { - .valid = 1, - .opt_param = { - [IB_QPT_UD] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_UC] = (IB_QP_CUR_STATE | - IB_QP_ACCESS_FLAGS), - [IB_QPT_SMI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - [IB_QPT_GSI] = (IB_QP_CUR_STATE | - IB_QP_QKEY), - } - } - }, - [IB_QPS_ERR] = { - [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 } - } -}; - -int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) -{ - enum ib_qp_attr_mask req_param, opt_param; - - if (cur_state < 0 || cur_state > IB_QPS_ERR || - next_state < 0 || next_state > IB_QPS_ERR) - return 0; - - if (mask & IB_QP_CUR_STATE && - cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && - cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) - return 0; - - if (!qp_state_table[cur_state][next_state].valid) - return 0; - - req_param = qp_state_table[cur_state][next_state].req_param[type]; - opt_param = qp_state_table[cur_state][next_state].opt_param[type]; - - if (ll == IB_LINK_LAYER_ETHERNET) { - req_param |= qp_state_table[cur_state][next_state]. 
- req_param_add_eth[type]; - opt_param |= qp_state_table[cur_state][next_state]. - opt_param_add_eth[type]; - } - - if ((mask & req_param) != req_param) - return 0; - - if (mask & ~(req_param | opt_param | IB_QP_STATE)) - return 0; - - return 1; -} -EXPORT_SYMBOL(ib_modify_qp_is_ok); - -int ib_modify_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask) -{ - int ret; - - ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); - if (!ret && (qp_attr_mask & IB_QP_PORT)) - qp->port_num = qp_attr->port_num; - - return ret; -} -EXPORT_SYMBOL(ib_modify_qp); - -int ib_query_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) -{ - return qp->device->query_qp ? - qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : - -ENOSYS; -} -EXPORT_SYMBOL(ib_query_qp); - -int ib_close_qp(struct ib_qp *qp) -{ - struct ib_qp *real_qp; - unsigned long flags; - - real_qp = qp->real_qp; - if (real_qp == qp) - return -EINVAL; - - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); - list_del(&qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); - - atomic_dec(&real_qp->usecnt); - kfree(qp); - - return 0; -} -EXPORT_SYMBOL(ib_close_qp); - -static int __ib_destroy_shared_qp(struct ib_qp *qp) -{ - struct ib_xrcd *xrcd; - struct ib_qp *real_qp; - int ret; - - real_qp = qp->real_qp; - xrcd = real_qp->xrcd; - - mutex_lock(&xrcd->tgt_qp_mutex); - ib_close_qp(qp); - if (atomic_read(&real_qp->usecnt) == 0) - list_del(&real_qp->xrcd_list); - else - real_qp = NULL; - mutex_unlock(&xrcd->tgt_qp_mutex); - - if (real_qp) { - ret = ib_destroy_qp(real_qp); - if (!ret) - atomic_dec(&xrcd->usecnt); - else - __ib_insert_xrcd_qp(xrcd, real_qp); - } - - return 0; -} - -int ib_destroy_qp(struct ib_qp *qp) -{ - struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; - int ret; - - if (atomic_read(&qp->usecnt)) - return -EBUSY; - - if (qp->real_qp != qp) - return __ib_destroy_shared_qp(qp); - - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; - - ret = qp->device->destroy_qp(qp); - if (!ret) { - if (pd) - atomic_dec(&pd->usecnt); - if (scq) - atomic_dec(&scq->usecnt); - if (rcq) - atomic_dec(&rcq->usecnt); - if (srq) - atomic_dec(&srq->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_destroy_qp); - -/* Completion queues */ - -struct ib_cq *ib_create_cq(struct ib_device *device, - ib_comp_handler comp_handler, - void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector) -{ - struct ib_cq *cq; - struct ib_cq_init_attr attr = { - .cqe = cqe, - .comp_vector = comp_vector, - .flags = 0, - }; - - cq = device->create_cq(device, &attr, NULL, NULL); - - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - } - - return cq; -} -EXPORT_SYMBOL(ib_create_cq); - -int ib_modify_cq(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask) -{ - return cq->device->modify_cq ? - cq->device->modify_cq(cq, cq_attr, cq_attr_mask) : -ENOSYS; -} -EXPORT_SYMBOL(ib_modify_cq); - -int ib_destroy_cq(struct ib_cq *cq) -{ - if (atomic_read(&cq->usecnt)) - return -EBUSY; - - return cq->device->destroy_cq(cq); -} -EXPORT_SYMBOL(ib_destroy_cq); - -int ib_resize_cq(struct ib_cq *cq, int cqe) -{ - return cq->device->resize_cq ? 
- cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS; -} -EXPORT_SYMBOL(ib_resize_cq); - -/* Memory regions */ - -struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - mr = pd->device->get_dma_mr(pd, mr_access_flags); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_get_dma_mr); - -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - if (!pd->device->reg_phys_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_reg_phys_mr); - -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_pd *old_pd; - int ret; - - ret = ib_check_mr_access(mr_access_flags); - if (ret) - return ret; - - if (!mr->device->rereg_phys_mr) - return -ENOSYS; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - old_pd = mr->pd; - - ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, - phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { - atomic_dec(&old_pd->usecnt); - atomic_inc(&pd->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_rereg_phys_mr); - -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - return mr->device->query_mr ? 
- mr->device->query_mr(mr, mr_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_mr); - -int ib_dereg_mr(struct ib_mr *mr) -{ - struct ib_pd *pd; - int ret; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; - ret = mr->device->dereg_mr(mr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dereg_mr); - -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr) -{ - struct ib_mr *mr; - - if (!pd->device->create_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->create_mr(pd, mr_init_attr); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_create_mr); - -int ib_destroy_mr(struct ib_mr *mr) -{ - struct ib_pd *pd; - int ret; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; - ret = mr->device->destroy_mr(mr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_destroy_mr); - -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) -{ - struct ib_mr *mr; - - if (!pd->device->alloc_fast_reg_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_alloc_fast_reg_mr); - -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, - int max_page_list_len) -{ - struct ib_fast_reg_page_list *page_list; - - if (!device->alloc_fast_reg_page_list) - return ERR_PTR(-ENOSYS); - - page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); - - if (!IS_ERR(page_list)) { - page_list->device = device; - page_list->max_page_list_len = max_page_list_len; - } - - return page_list; -} -EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); - -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) -{ - page_list->device->free_fast_reg_page_list(page_list); -} -EXPORT_SYMBOL(ib_free_fast_reg_page_list); - -/* Memory windows */ - -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) -{ - struct ib_mw *mw; - - if (!pd->device->alloc_mw) - return ERR_PTR(-ENOSYS); - - mw = pd->device->alloc_mw(pd, type); - if (!IS_ERR(mw)) { - mw->device = pd->device; - mw->pd = pd; - mw->uobject = NULL; - mw->type = type; - atomic_inc(&pd->usecnt); - } - - return mw; -} -EXPORT_SYMBOL(ib_alloc_mw); - -int ib_dealloc_mw(struct ib_mw *mw) -{ - struct ib_pd *pd; - int ret; - - pd = mw->pd; - ret = mw->device->dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_mw); - -/* "Fast" memory regions */ - -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct ib_fmr *fmr; - - if (!pd->device->alloc_fmr) - return ERR_PTR(-ENOSYS); - - fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); - if (!IS_ERR(fmr)) { - fmr->device = pd->device; - fmr->pd = pd; - atomic_inc(&pd->usecnt); - } - - return fmr; -} -EXPORT_SYMBOL(ib_alloc_fmr); - -int ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - - if (list_empty(fmr_list)) - return 0; - - fmr = list_entry(fmr_list->next, struct ib_fmr, list); - return fmr->device->unmap_fmr(fmr_list); -} -EXPORT_SYMBOL(ib_unmap_fmr); - -int ib_dealloc_fmr(struct ib_fmr *fmr) -{ - struct ib_pd *pd; - int ret; - - pd = fmr->pd; - ret = fmr->device->dealloc_fmr(fmr); 
- if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_fmr); - -/* Multicast groups */ - -int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) -{ - int ret; - - if (!qp->device->attach_mcast) - return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } - - ret = qp->device->attach_mcast(qp, gid, lid); - if (!ret) - atomic_inc(&qp->usecnt); - return ret; -} -EXPORT_SYMBOL(ib_attach_mcast); - -int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) -{ - int ret; - - if (!qp->device->detach_mcast) - return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } - - ret = qp->device->detach_mcast(qp, gid, lid); - if (!ret) - atomic_dec(&qp->usecnt); - return ret; -} -EXPORT_SYMBOL(ib_detach_mcast); - -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) -{ - struct ib_xrcd *xrcd; - - if (!device->alloc_xrcd) - return ERR_PTR(-ENOSYS); - - xrcd = device->alloc_xrcd(device, NULL, NULL); - if (!IS_ERR(xrcd)) { - xrcd->device = device; - xrcd->inode = NULL; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); - } - - return xrcd; -} -EXPORT_SYMBOL(ib_alloc_xrcd); - -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) -{ - struct ib_qp *qp; - int ret; - - if (atomic_read(&xrcd->usecnt)) - return -EBUSY; - - while (!list_empty(&xrcd->tgt_qp_list)) { - qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); - ret = ib_destroy_qp(qp); - if (ret) - return ret; - } - - return xrcd->device->dealloc_xrcd(xrcd); -} -EXPORT_SYMBOL(ib_dealloc_xrcd); - -struct ib_flow *ib_create_flow(struct ib_qp *qp, - struct ib_flow_attr *flow_attr, - int domain) -{ - struct ib_flow *flow_id; - if (!qp->device->create_flow) - return ERR_PTR(-ENOSYS); - - flow_id = qp->device->create_flow(qp, flow_attr, domain); - if (!IS_ERR(flow_id)) - atomic_inc(&qp->usecnt); - return flow_id; -} -EXPORT_SYMBOL(ib_create_flow); - -int ib_destroy_flow(struct ib_flow *flow_id) -{ - int err; - struct ib_qp *qp; - - if (!flow_id) - return -EINVAL; - qp = flow_id->qp; - if (!qp->device->destroy_flow) - return -ENOSYS; - err = qp->device->destroy_flow(flow_id); - if (!err) - atomic_dec(&qp->usecnt); - return err; -} -EXPORT_SYMBOL(ib_destroy_flow); - -struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, - struct ib_udata *udata) -{ - struct ib_dct *dct; - - if (!pd->device->exp_create_dct) - return ERR_PTR(-ENOSYS); - - dct = pd->device->exp_create_dct(pd, attr, udata); - if (!IS_ERR(dct)) { - dct->pd = pd; - dct->srq = attr->srq; - dct->cq = attr->cq; - atomic_inc(&dct->srq->usecnt); - atomic_inc(&dct->cq->usecnt); - atomic_inc(&dct->pd->usecnt); - } - - return dct; -} -EXPORT_SYMBOL(ib_create_dct); - -int ib_destroy_dct(struct ib_dct *dct) -{ - int err; - - if (!dct->device->exp_destroy_dct) - return -ENOSYS; - - err = dct->device->exp_destroy_dct(dct); - if (!err) { - atomic_dec(&dct->srq->usecnt); - atomic_dec(&dct->cq->usecnt); - 
atomic_dec(&dct->pd->usecnt); - } - - return err; -} -EXPORT_SYMBOL(ib_destroy_dct); - -int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr) -{ - if (!dct->device->exp_query_dct) - return -ENOSYS; - - return dct->device->exp_query_dct(dct, attr); -} -EXPORT_SYMBOL(ib_query_dct); - -int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, - struct ib_mr_status *mr_status) -{ - return mr->device->check_mr_status ? - mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; -} -EXPORT_SYMBOL(ib_check_mr_status); Index: sys/ofed/drivers/infiniband/debug/Makefile =================================================================== --- sys/ofed/drivers/infiniband/debug/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -EXTRA_CFLAGS := $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS)) - -obj-m += memtrack.o Index: sys/ofed/drivers/infiniband/debug/memtrack.h =================================================================== --- sys/ofed/drivers/infiniband/debug/memtrack.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - This software is available to you under a choice of one of two - licenses. You may choose to be licensed under the terms of the GNU - General Public License (GPL) Version 2, available at - , or the OpenIB.org BSD - license, available in the LICENSE.TXT file accompanying this - software. These details are also available at - . - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. -*/ - -#ifndef H_MEMTRACK_H -#define H_MEMTRACK_H - -enum memtrack_memtype_t { - MEMTRACK_KMALLOC, - MEMTRACK_VMALLOC, - MEMTRACK_KMEM_OBJ, - MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */ - MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */ - MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */ - MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */ - MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */ - MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */ - MEMTRACK_NUM_OF_MEMTYPES -}; - -/* Invoke on memory allocation */ -void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, unsigned long addr2, - int direction, const char *filename, - const unsigned long line_num, int alloc_flags); - -/* Invoke on memory free */ -void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, int direction, - const char *filename, const unsigned long line_num); - -/* - * This function recognizes allocations which - * may be released by kernel (e.g. skb & vnic) and - * therefore not trackable by memtrack. - * The allocations are recognized by the name - * of their calling function. - */ -int is_non_trackable_alloc_func(const char *func_name); -/* - * In some cases we need to free a memory - * we defined as "non trackable" (see - * is_non_trackable_alloc_func). - * This function recognizes such releases - * by the name of their calling function. 
- */ -int is_non_trackable_free_func(const char *func_name); - -/* WA - In this function handles confirm - the function name is - '__ib_umem_release' or 'ib_umem_get' - In this case we won't track the - memory there because the kernel - was the one who allocated it. - Return value: - 1 - if the function name is match, else 0 */ -int is_umem_put_page(const char *func_name); - -/* Check page order size - When Freeing a page allocation it checks whether - we are trying to free the same amount of pages - we ask to allocate (In log2(order)). - In case an error if found it will print - an error msg */ -int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, - const unsigned long line_num); - -/* Search for a specific addr whether it exist in the - current data-base. - If not it will print an error msg, - Return value: 0 - if addr exist, else 1 */ -int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, - const char *filename, const unsigned long line_num); - -/* Return current page reference counter */ -int memtrack_get_page_ref_count(unsigned long addr); - -/* Report current allocations status (for all memory types) */ -/* we do not export this function since it is used by cleanup_module only */ -/* void memtrack_report(void); */ - -/* Allow support of error injections */ -int memtrack_inject_error(void); - -/* randomize allocated memory */ -int memtrack_randomize_mem(void); - -#endif Index: sys/ofed/drivers/infiniband/debug/memtrack.c =================================================================== --- sys/ofed/drivers/infiniband/debug/memtrack.c +++ /dev/null @@ -1,960 +0,0 @@ -/* - This software is available to you under a choice of one of two - licenses. You may choose to be licensed under the terms of the GNU - General Public License (GPL) Version 2, available at - , or the OpenIB.org BSD - license, available in the LICENSE.TXT file accompanying this - software. These details are also available at - . - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. 
-*/ - -#define LINUXKPI_PARAM_PREFIX memtrack_ - -#define C_MEMTRACK_C - -#ifdef kmalloc - #undef kmalloc -#endif -#ifdef kmemdup - #undef kmemdup -#endif -#ifdef kfree - #undef kfree -#endif -#ifdef vmalloc - #undef vmalloc -#endif -#ifdef vzalloc - #undef vzalloc -#endif -#ifdef vzalloc_node - #undef vzalloc_node -#endif -#ifdef vfree - #undef vfree -#endif -#ifdef kmem_cache_alloc - #undef kmem_cache_alloc -#endif -#ifdef kmem_cache_free - #undef kmem_cache_free -#endif -#ifdef ioremap - #undef ioremap -#endif -#ifdef io_mapping_create_wc - #undef io_mapping_create_wc -#endif -#ifdef io_mapping_free - #undef io_mapping_free -#endif -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#ifdef iounmap - #undef iounmap -#endif -#ifdef alloc_pages - #undef alloc_pages -#endif -#ifdef free_pages - #undef free_pages -#endif -#ifdef get_page - #undef get_page -#endif -#ifdef put_page - #undef put_page -#endif -#ifdef create_workqueue - #undef create_workqueue -#endif -#ifdef create_rt_workqueue - #undef create_rt_workqueue -#endif -#ifdef create_freezeable_workqueue - #undef create_freezeable_workqueue -#endif -#ifdef create_singlethread_workqueue - #undef create_singlethread_workqueue -#endif -#ifdef destroy_workqueue - #undef destroy_workqueue -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "memtrack.h" - -#include - - -MODULE_AUTHOR("Mellanox Technologies LTD."); -MODULE_DESCRIPTION("Memory allocations tracking"); -MODULE_LICENSE("GPL"); - -#define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */ -#define MAX_FILENAME_LEN 31 - -#define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags) -#define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags) - -/* if a bit is set then the corresponding allocation is tracked. - bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */ -static unsigned long track_mask = -1; /* effectively everything */ -module_param(track_mask, ulong, 0444); -MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked"); - -/* if a bit is set then the corresponding allocation is strictly tracked. - That is, before inserting the whole range is checked to not overlap any - of the allocations already in the database */ -static unsigned long strict_track_mask = 0; /* no strict tracking */ -module_param(strict_track_mask, ulong, 0444); -MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking"); - -/* Sets the frequency of allocations failures injections - if set to 0 all allocation should succeed */ -static unsigned int inject_freq = 0; -module_param(inject_freq, uint, 0644); -MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)"); - -static int random_mem = 1; -module_param(random_mem, uint, 0644); -MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)"); - -struct memtrack_meminfo_t { - unsigned long addr; - unsigned long size; - unsigned long line_num; - unsigned long dev; - unsigned long addr2; - int direction; - struct memtrack_meminfo_t *next; - struct list_head list; /* used to link all items from a certain type together */ - char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. 
packing */ - char ext_info[32]; -}; - -static struct kmem_cache *meminfo_cache; - -struct tracked_obj_desc_t { - struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; - spinlock_t hash_lock; - unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */ - struct list_head tracked_objs_head; /* head of list of all objects */ - int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */ -}; - -static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; - -static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = { - "kmalloc", - "vmalloc", - "kmem_cache_alloc", - "io_remap", - "create_workqueue", - "alloc_pages", - "ib_dma_map_single", - "ib_dma_map_page", - "ib_dma_map_sg" -}; - -static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = { - "kfree", - "vfree", - "kmem_cache_free", - "io_unmap", - "destory_workqueue", - "free_pages", - "ib_dma_unmap_single", - "ib_dma_unmap_page", - "ib_dma_unmap_sg" -}; - -static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype) -{ - switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: - case MEMTRACK_IOREMAP: - case MEMTRACK_WORK_QUEUE: - case MEMTRACK_PAGE_ALLOC: - case MEMTRACK_DMA_MAP_SINGLE: - case MEMTRACK_DMA_MAP_PAGE: - case MEMTRACK_DMA_MAP_SG: - return rsc_names[memtype]; - default: - return "(Unknown allocation type)"; - } -} - -static inline const char *memtype_free_str(enum memtrack_memtype_t memtype) -{ - switch (memtype) { - case MEMTRACK_KMALLOC: - case MEMTRACK_VMALLOC: - case MEMTRACK_KMEM_OBJ: - case MEMTRACK_IOREMAP: - case MEMTRACK_WORK_QUEUE: - case MEMTRACK_PAGE_ALLOC: - case MEMTRACK_DMA_MAP_SINGLE: - case MEMTRACK_DMA_MAP_PAGE: - case MEMTRACK_DMA_MAP_SG: - return rsc_free_names[memtype]; - default: - return "(Unknown allocation type)"; - } -} - -/* - * overlap_a_b - */ -static inline int overlap_a_b(unsigned long a_start, unsigned long a_end, - unsigned long b_start, unsigned long b_end) -{ - if ((b_start > a_end) || (a_start > b_end)) - return 0; - - return 1; -} - -/* - * check_overlap - */ -static void check_overlap(enum memtrack_memtype_t memtype, - struct memtrack_meminfo_t *mem_info_p, - struct tracked_obj_desc_t *obj_desc_p) -{ - struct list_head *pos, *next; - struct memtrack_meminfo_t *cur; - unsigned long start_a, end_a, start_b, end_b; - - start_a = mem_info_p->addr; - end_a = mem_info_p->addr + mem_info_p->size - 1; - - list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { - cur = list_entry(pos, struct memtrack_meminfo_t, list); - - start_b = cur->addr; - end_b = cur->addr + cur->size - 1; - - if (overlap_a_b(start_a, end_a, start_b, end_b)) - printk(KERN_ERR "%s overlaps! 
new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", - memtype_alloc_str(memtype), mem_info_p->addr, - mem_info_p->addr + mem_info_p->size - 1, cur->addr, - cur->addr + cur->size - 1); - } -} - -/* Invoke on memory allocation */ -void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, unsigned long addr2, - int direction, const char *filename, - const unsigned long line_num, int alloc_flags) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags); - if (new_mem_info_p == NULL) { - printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. " - "Lost tracking on allocation at %s:%lu...\n", __func__, - filename, line_num); - return; - } - /* save allocation properties */ - new_mem_info_p->addr = addr; - new_mem_info_p->size = size; - new_mem_info_p->dev = dev; - new_mem_info_p->addr2 = addr2; - new_mem_info_p->direction = direction; - - new_mem_info_p->line_num = line_num; - *new_mem_info_p->ext_info = '\0'; - /* Make sure that we will print out the path tail if the given filename is longer - * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file - * in the printout -- only the path head! - */ - if (strlen(filename) > MAX_FILENAME_LEN) - strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN); - else - strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN); - - new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */ - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* make sure given memory location is not already allocated */ - if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) && - (memtype != MEMTRACK_DMA_MAP_SG)) { - - /* make sure given memory location is not already allocated */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { - /* Found given address in the database */ - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", - __func__, filename, line_num, - memtype_alloc_str(memtype), addr, - cur_mem_info_p->filename, - cur_mem_info_p->line_num); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - kmem_cache_free(meminfo_cache, new_mem_info_p); - return; - } - cur_mem_info_p = cur_mem_info_p->next; - } - } - /* not found - we can put in the hash bucket */ - /* link as first */ - new_mem_info_p->next = obj_desc_p->mem_hash[hash_val]; - obj_desc_p->mem_hash[hash_val] = new_mem_info_p; - if (obj_desc_p->strict_track) - check_overlap(memtype, new_mem_info_p, obj_desc_p); - obj_desc_p->count += size; - list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head); - - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return; -} -EXPORT_SYMBOL(memtrack_alloc); - -/* Invoke on memory free */ -void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, - unsigned long addr, unsigned long size, int direction, - 
const char *filename, const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - prev_mem_info_p = NULL; - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { - /* Found given address in the database */ - if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) || - (memtype == MEMTRACK_DMA_MAP_SG)) { - if (direction != cur_mem_info_p->direction) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction, - cur_mem_info_p->filename, cur_mem_info_p->line_num); - - if (size != cur_mem_info_p->size) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size, - cur_mem_info_p->filename, cur_mem_info_p->line_num); - } - - /* Remove from the bucket/list */ - if (prev_mem_info_p == NULL) - obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ - else - prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ - - list_del(&cur_mem_info_p->list); - - obj_desc_p->count -= cur_mem_info_p->size; - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - kmem_cache_free(meminfo_cache, cur_mem_info_p); - return; - } - prev_mem_info_p = cur_mem_info_p; - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n", - __func__, filename, line_num, memtype_free_str(memtype), addr, dev); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return; -} -EXPORT_SYMBOL(memtrack_free); - -/* - * This function recognizes allocations which - * may be released by kernel (e.g. skb) and - * therefore not trackable by memtrack. - * The allocations are recognized by the name - * of their calling function. 
- */ -int is_non_trackable_alloc_func(const char *func_name) -{ - static const char * const str_str_arr[] = { - /* functions containing these strings consider non trackable */ - "skb", - }; - static const char * const str_str_excep_arr[] = { - /* functions which are exception to the str_str_arr table */ - "ipoib_cm_skb_too_long" - }; - static const char * const str_cmp_arr[] = { - /* functions that allocate SKBs */ - "mlx4_en_alloc_frags", - "mlx4_en_alloc_frag", - "mlx4_en_init_allocator", - "mlx4_en_free_frag", - "mlx4_en_free_rx_desc", - "mlx4_en_destroy_allocator", - "mlx4_en_complete_rx_desc", - /* vnic skb functions */ - "free_single_frag", - "vnic_alloc_rx_skb", - "vnic_rx_skb", - "vnic_alloc_frag", - "vnic_empty_rx_entry", - "vnic_init_allocator", - "vnic_destroy_allocator", - "sdp_post_recv", - "sdp_rx_ring_purge", - "sdp_post_srcavail", - "sk_stream_alloc_page", - "update_send_head", - "sdp_bcopy_get", - "sdp_destroy_resources", - - /* function that allocate memory for RDMA device context */ - "ib_alloc_device" - }; - size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *); - size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *); - size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); - - int i, j; - - for (i = 0; i < str_str_arr_size; ++i) - if (strstr(func_name, str_str_arr[i])) { - for (j = 0; j < str_str_excep_size; ++j) - if (!strcmp(func_name, str_str_excep_arr[j])) - return 0; - return 1; - } - for (i = 0; i < str_cmp_arr_size; ++i) - if (!strcmp(func_name, str_cmp_arr[i])) - return 1; - return 0; -} -EXPORT_SYMBOL(is_non_trackable_alloc_func); - -/* - * In some cases we need to free a memory - * we defined as "non trackable" (see - * is_non_trackable_alloc_func). - * This function recognizes such releases - * by the name of their calling function. - */ -int is_non_trackable_free_func(const char *func_name) -{ - - static const char * const str_cmp_arr[] = { - /* function that deallocate memory for RDMA device context */ - "ib_dealloc_device" - }; - size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); - - int i; - - for (i = 0; i < str_cmp_arr_size; ++i) - if (!strcmp(func_name, str_cmp_arr[i])) - return 1; - return 0; -} -EXPORT_SYMBOL(is_non_trackable_free_func); - - -/* WA - In this function handles confirm - the function name is - '__ib_umem_release' or 'ib_umem_get' - In this case we won't track the - memory there because the kernel - was the one who allocated it. - Return value: - 1 - if the function name is match, else 0 */ -int is_umem_put_page(const char *func_name) -{ - const char func_str[18] = "__ib_umem_release"; - /* In case of error flow put_page is called as part of ib_umem_get */ - const char func_str1[12] = "ib_umem_get"; - - return ((strstr(func_name, func_str) != NULL) || - (strstr(func_name, func_str1) != NULL)) ? 
1 : 0; -} -EXPORT_SYMBOL(is_umem_put_page); - -/* Check page order size - When Freeing a page allocation it checks whether - we are trying to free the same size - we asked to allocate */ -int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, - unsigned long size, const char *filename, - const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - int ret = 0; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return 1; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return 1; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - check size */ - if (cur_mem_info_p->size != size) { - printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n", - __func__, filename, line_num, memtype_free_str(memtype), - addr, size, cur_mem_info_p->size); - snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info), - "invalid free size %lu\n", size); - ret = 1; - } - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ret; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found - This function will not give any indication - but will only check the correct size\order - For inconsistency the 'free' function will check that */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 1; -} -EXPORT_SYMBOL(memtrack_check_size); - -/* Search for a specific addr whether it exist in the - current data-base. 
- It will print an error msg if we get an unexpected result, - Return value: 0 - if addr exist, else 1 */ -int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, - const char *filename, const unsigned long line_num) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); - return 1; - } - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return 0; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - exiting */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 0; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - if (expect_exist) - printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", - __func__, filename, line_num, memtype_free_str(memtype), addr); - - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return 1; -} -EXPORT_SYMBOL(memtrack_is_new_addr); - -/* Return current page reference counter */ -int memtrack_get_page_ref_count(unsigned long addr) -{ - unsigned long hash_val; - struct memtrack_meminfo_t *cur_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - /* This function is called only for page allocation */ - enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC; - int ref_conut = 0; - - if (!tracked_objs_arr[memtype]) { - /* object is not tracked */ - return ref_conut; - } - obj_desc_p = tracked_objs_arr[memtype]; - - hash_val = addr % MEMTRACK_HASH_SZ; - - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); - /* find mem_info of given memory location */ - cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; - while (cur_mem_info_p != NULL) { - if (cur_mem_info_p->addr == addr) { - /* Found given address in the database - check ref-count */ - struct page *page = (struct page *)(cur_mem_info_p->addr); - ref_conut = atomic_read(&page->_count); - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ref_conut; - } - cur_mem_info_p = cur_mem_info_p->next; - } - - /* not found */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - return ref_conut; -} -EXPORT_SYMBOL(memtrack_get_page_ref_count); - -/* Report current allocations status (for all memory types) */ -static void memtrack_report(void) -{ - enum memtrack_memtype_t memtype; - unsigned long cur_bucket; - struct memtrack_meminfo_t *cur_mem_info_p; - int serial = 1; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - unsigned long detected_leaks = 0; - - printk(KERN_INFO "%s: Currently known allocations:\n", __func__); - for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { - if (tracked_objs_arr[memtype]) { - printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype)); - obj_desc_p = tracked_objs_arr[memtype]; - /* Scan all buckets to find existing allocations */ - /* TBD: this may be optimized by holding a linked list of all hash items */ - for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; 
- while (cur_mem_info_p != NULL) { /* scan bucket */ - printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n", - cur_mem_info_p->filename, - cur_mem_info_p->line_num, - memtype_alloc_str(memtype), - cur_mem_info_p->size, - cur_mem_info_p->addr, - cur_mem_info_p->dev, - cur_mem_info_p->ext_info); - cur_mem_info_p = cur_mem_info_p->next; - ++ detected_leaks; - } /* while cur_mem_info_p */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - } /* for cur_bucket */ - serial++; - } - } /* for memtype */ - printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks); -} - - - -static struct proc_dir_entry *memtrack_tree; - -static enum memtrack_memtype_t get_rsc_by_name(const char *name) -{ - enum memtrack_memtype_t i; - - for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) { - if (strcmp(name, rsc_names[i]) == 0) - return i; - } - - return i; -} - - -static ssize_t memtrack_read(struct file *filp, - char __user *buf, - size_t size, - loff_t *offset) -{ - unsigned long cur, flags; - loff_t pos = *offset; - static char kbuf[20]; - static int file_len; - int _read, to_ret, left; - const char *fname; - enum memtrack_memtype_t memtype; - - if (pos < 0) - return -EINVAL; - - fname = filp->f_dentry->d_name.name; - - memtype = get_rsc_by_name(fname); - if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { - printk(KERN_ERR "invalid file name\n"); - return -EINVAL; - } - - if (pos == 0) { - memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags); - cur = tracked_objs_arr[memtype]->count; - memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags); - _read = sprintf(kbuf, "%lu\n", cur); - if (_read < 0) - return _read; - else - file_len = _read; - } - - left = file_len - pos; - to_ret = (left < size) ? left : size; - if (copy_to_user(buf, kbuf+pos, to_ret)) - return -EFAULT; - else { - *offset = pos + to_ret; - return to_ret; - } -} - -static const struct file_operations memtrack_proc_fops = { - .read = memtrack_read, -}; - -static const char *memtrack_proc_entry_name = "mt_memtrack"; - -static int create_procfs_tree(void) -{ - struct proc_dir_entry *dir_ent; - struct proc_dir_entry *proc_ent; - int i, j; - unsigned long bit_mask; - - dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL); - if (!dir_ent) - return -1; - - memtrack_tree = dir_ent; - - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) { - proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree); - if (!proc_ent) - goto undo_create_root; - - proc_ent->proc_fops = &memtrack_proc_fops; - } - } - - goto exit_ok; - -undo_create_root: - for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) { - if (bit_mask & track_mask) - remove_proc_entry(rsc_names[j], memtrack_tree); - } - remove_proc_entry(memtrack_proc_entry_name, NULL); - return -1; - -exit_ok: - return 0; -} - - -static void destroy_procfs_tree(void) -{ - int i; - unsigned long bit_mask; - - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) - remove_proc_entry(rsc_names[i], memtrack_tree); - - } - remove_proc_entry(memtrack_proc_entry_name, NULL); -} - -int memtrack_inject_error(void) -{ - int val = 0; - - if (inject_freq) { - if (!(random32() % inject_freq)) - val = 1; - } - - return val; -} -EXPORT_SYMBOL(memtrack_inject_error); - -int memtrack_randomize_mem(void) -{ - return random_mem; -} -EXPORT_SYMBOL(memtrack_randomize_mem); - -/* module entry points */ - -int init_module(void) -{ - enum memtrack_memtype_t i; - int j; - unsigned long 
bit_mask; - - - /* create a cache for the memtrack_meminfo_t strcutures */ - meminfo_cache = kmem_cache_create("memtrack_meminfo_t", - sizeof(struct memtrack_meminfo_t), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!meminfo_cache) { - printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__); - return -1; - } - - /* initialize array of descriptors */ - memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr)); - - /* create a tracking object descriptor for all required objects */ - for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { - if (bit_mask & track_mask) { - tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t)); - if (!tracked_objs_arr[i]) { - printk(KERN_ERR "memtrack: failed to allocate tracking object\n"); - goto undo_cache_create; - } - - memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t)); - spin_lock_init(&tracked_objs_arr[i]->hash_lock); - INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head); - if (bit_mask & strict_track_mask) - tracked_objs_arr[i]->strict_track = 1; - else - tracked_objs_arr[i]->strict_track = 0; - } - } - - - if (create_procfs_tree()) { - printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__); - goto undo_cache_create; - } - - printk(KERN_INFO "memtrack::%s done.\n", __func__); - - return 0; - -undo_cache_create: - for (j = 0; j < i; ++j) { - if (tracked_objs_arr[j]) - vfree(tracked_objs_arr[j]); - } - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) - if (kmem_cache_destroy(meminfo_cache) != 0) - printk(KERN_ERR "Failed on kmem_cache_destroy!\n"); -#else - kmem_cache_destroy(meminfo_cache); -#endif - return -1; -} - - -void cleanup_module(void) -{ - enum memtrack_memtype_t memtype; - unsigned long cur_bucket; - struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p; - struct tracked_obj_desc_t *obj_desc_p; - unsigned long flags; - - - memtrack_report(); - - - destroy_procfs_tree(); - - /* clean up any hash table left-overs */ - for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { - /* Scan all buckets to find existing allocations */ - /* TBD: this may be optimized by holding a linked list of all hash items */ - if (tracked_objs_arr[memtype]) { - obj_desc_p = tracked_objs_arr[memtype]; - for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { - memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ - cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; - while (cur_mem_info_p != NULL) { /* scan bucket */ - next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */ - kmem_cache_free(meminfo_cache, cur_mem_info_p); - cur_mem_info_p = next_mem_info_p; - } /* while cur_mem_info_p */ - memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); - } /* for cur_bucket */ - vfree(obj_desc_p); - } - } /* for memtype */ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) - if (kmem_cache_destroy(meminfo_cache) != 0) - printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n"); -#else - kmem_cache_destroy(meminfo_cache); -#endif - printk(KERN_INFO "memtrack::cleanup_module done.\n"); -} Index: sys/ofed/drivers/infiniband/debug/mtrack.h =================================================================== --- sys/ofed/drivers/infiniband/debug/mtrack.h +++ /dev/null @@ -1,844 +0,0 @@ -#ifndef __mtrack_h_ -#define __mtrack_h_ - -#include "memtrack.h" - -#include -#include -#include -#include /* For ioremap_nocache, ioremap, iounmap */ -#include -#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 27) -# include /* 
For ioremap_nocache, ioremap, iounmap */ -#endif -#include /* For all page handling */ -#include /* For all work-queue handling */ -#include /* For using scatterlists */ -#include /* For skbufs handling */ -#include /* For copy from/to user */ - -#define MEMTRACK_ERROR_INJECTION_MESSAGE(file, line, func) ({ \ - printk(KERN_ERR "%s failure injected at %s:%d\n", func, file, line); \ - dump_stack(); \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14) -#define RDMA_KZALLOC_H -#define kzalloc(size, flags) ({ \ - void *__memtrack_kz_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ - else \ - __memtrack_kz_addr = kmalloc(size, flags); \ - if (__memtrack_kz_addr && !is_non_trackable_alloc_func(__func__)) { \ - memset(__memtrack_kz_addr, 0, size); \ - } \ - __memtrack_kz_addr; \ -}) - -#else -#define kzalloc(size, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\ - else \ - __memtrack_addr = kzalloc(size, flags); \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - -#endif - -#define kzalloc_node(size, flags, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc_node"); \ - else \ - __memtrack_addr = kzalloc_node(size, flags, node); \ - if (__memtrack_addr && (size) && \ - !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) -#define kcalloc(n, size, flags) kzalloc((n)*(size), flags) -#else -#define kcalloc(n, size, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kcalloc");\ - else \ - __memtrack_addr = kcalloc(n, size, flags); \ - if (__memtrack_addr && (size)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) -#endif - - - -#ifdef ZERO_OR_NULL_PTR -#define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ - else \ - __memtrack_addr = kmalloc(sz, flgs); \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) -#else -#define kmalloc(sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\ - else \ - __memtrack_addr = kmalloc(sz, flgs); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) - -#endif - -#define kmalloc_node(sz, flgs, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, 
"kmalloc_node"); \ - else \ - __memtrack_addr = kmalloc_node(sz, flgs, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ - get_random_bytes(__memtrack_addr, sz); \ - } \ - __memtrack_addr; \ -}) - -#ifdef ZERO_OR_NULL_PTR -#define kmemdup(src, sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ - else \ - __memtrack_addr = kmemdup(src, sz, flgs); \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - } \ - __memtrack_addr; \ -}) -#else -#define kmemdup(src, sz, flgs) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\ - else \ - __memtrack_addr = kmemdup(src, sz, flgs); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ - } \ - __memtrack_addr; \ -}) -#endif - -#ifdef ZERO_OR_NULL_PTR -#define kfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ - !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kfree(__memtrack_addr); \ -}) -#else -#define kfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kfree(__memtrack_addr); \ -}) -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined (CONFIG_COMPAT_RCU) -#ifdef kfree_rcu - #undef kfree_rcu -#endif - -#ifdef ZERO_OR_NULL_PTR -#define kfree_rcu(addr, rcu_head) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \ - !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ -}) -#else -#define kfree_rcu(addr, rcu_head) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \ - memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ -}) -#endif -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) */ - -#define vmalloc(size) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc");\ - else \ - __memtrack_addr = vmalloc(size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) - -#ifndef vzalloc -#define vzalloc(size) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc");\ - else \ - __memtrack_addr = vzalloc(size); \ - if (__memtrack_addr) { \ - 
memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#endif - -#ifndef vzalloc_node -#define vzalloc_node(size, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc_node"); \ - else \ - __memtrack_addr = vzalloc_node(size, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) -#endif - -#define vmalloc_node(size, node) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc_node"); \ - else \ - __memtrack_addr = vmalloc_node(size, node); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - if (memtrack_randomize_mem()) \ - get_random_bytes(__memtrack_addr, size); \ - } \ - __memtrack_addr; \ -}) - -#define vfree(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - vfree(__memtrack_addr); \ -}) - - -#define kmem_cache_alloc(cache, flags) ({ \ - void *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmem_cache_alloc"); \ - else \ - __memtrack_addr = kmem_cache_alloc(cache, flags); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \ - } \ - __memtrack_addr; \ -}) - - -#define kmem_cache_free(cache, addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - kmem_cache_free(cache, __memtrack_addr); \ -}) - - -/* All IO-MAP handling */ -#define ioremap(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap");\ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) - -#define io_mapping_create_wc(base, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "io_mapping_create_wc"); \ - else \ - __memtrack_addr = io_mapping_create_wc(base, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) - -#define io_mapping_free(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - io_mapping_free(__memtrack_addr); \ -}) - -#ifdef CONFIG_PPC -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - 
MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#else -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18) /* 2.6.16 - 2.6.17 */ -#ifdef ioremap_nocache - #undef ioremap_nocache -#endif -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#else -#define ioremap_nocache(phys_addr, size) ({ \ - void __iomem *__memtrack_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \ - else \ - __memtrack_addr = ioremap_nocache(phys_addr, size); \ - if (__memtrack_addr) { \ - memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_addr; \ -}) -#endif /* Kernel version is under 2.6.18 */ -#endif /* PPC */ - -#define iounmap(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - iounmap(__memtrack_addr); \ -}) - - -/* All Page handlers */ -/* TODO: Catch netif_rx for page dereference */ -#define alloc_pages_node(nid, gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages_node"); \ - else \ - page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - page_addr; \ -}) - -#ifdef CONFIG_NUMA -#define alloc_pages(gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages"); \ - else \ - page_addr = (struct page *)alloc_pages(gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - page_addr; \ -}) -#else -#ifdef alloc_pages - #undef alloc_pages -#endif -#define alloc_pages(gfp_mask, order) ({ \ - struct page *page_addr; \ - \ - page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \ - page_addr; \ -}) -#endif - -#define __get_free_pages(gfp_mask, order) ({ \ - struct page *page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "__get_free_pages"); \ - else \ - page_addr = (struct page *)__get_free_pages(gfp_mask, order); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - page_addr; \ -}) - -#define get_zeroed_page(gfp_mask) ({ \ - struct page 
*page_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_zeroed_page"); \ - else \ - page_addr = (struct page *)get_zeroed_page(gfp_mask); \ - if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - (unsigned long)page_addr; \ -}) - -#define __free_pages(addr, order) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - __free_pages(addr, order); \ -}) - - -#define free_pages(addr, order) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - free_pages(addr, order); \ -}) - - -#define get_page(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - } \ - get_page(addr); \ -}) - -#define get_user_pages_fast(start, nr_pages, write, pages) ({ \ - int __memtrack_rc = -1; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_user_pages_fast"); \ - else \ - __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \ - if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \ - int __memtrack_i; \ - \ - for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \ - memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - __memtrack_rc; \ -}) - -#define put_page(addr) ({ \ - void *__memtrack_addr = (void *)addr; \ - \ - if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ - /* Check whether this is not part of umem put page & not */\ - /* a new addr and the ref-count is 1 then we'll free this addr */\ - /* Don't change the order these conditions */ \ - if (!is_umem_put_page(__func__) && \ - !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \ - (memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \ - memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - } \ - put_page(addr); \ -}) - - -/* Work-Queue handlers */ -#ifdef create_workqueue - #undef create_workqueue -#endif -#ifdef create_rt_workqueue - #undef create_rt_workqueue -#endif -#ifdef create_freezeable_workqueue - #undef create_freezeable_workqueue -#endif -#ifdef create_singlethread_workqueue - #undef create_singlethread_workqueue -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) /* 2.6.18 - 2.6.19 */ -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - 
MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) /* 2.6.20 - 2.6.27 */ -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 22) /* 2.6.20 - 2.6.21 */ -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else /* 2.6.22 - 2.6.27 */ -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#endif /* 2.6.20 - 2.6.27 */ - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) /* 2.6.28 - 2.6.35 */ - -#ifdef alloc_workqueue - #undef alloc_workqueue -#endif - -#define alloc_workqueue(name, flags, max_active) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), (flags), (max_active), 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 
0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_rt_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_rt_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 0, 0, 1); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_freezeable_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 1, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) - -#define create_singlethread_workqueue(name) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \ - else \ - wq_addr = __create_workqueue((name), 1, 0, 0); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else /* 2.6.36 */ -#ifdef alloc_workqueue - #undef alloc_workqueue -#endif -#ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ -({ \ - static struct lock_class_key __key; \ - const char *__lock_name; \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ - else \ - __lock_name = #name; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#else -#define alloc_workqueue(name, flags, max_active) ({ \ - struct workqueue_struct *wq_addr = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \ - else \ - wq_addr = __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL); \ - if (wq_addr) { \ - memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ - } \ - wq_addr; \ -}) -#endif - -#define create_workqueue(name) \ - alloc_workqueue((name), WQ_RESCUER, 1); - -#define create_freezeable_workqueue(name) \ - alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1); - -#define create_singlethread_workqueue(name) \ - alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1); - -#endif /* Work-Queue Kernel Versions */ - -#define destroy_workqueue(wq_addr) ({ \ - void *__memtrack_addr = (void *)wq_addr; \ - \ - if (__memtrack_addr) { \ - memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ - } \ - destroy_workqueue(wq_addr); \ -}) - -/* ONLY error injection to functions that we don't monitor */ -#define alloc_skb(size, prio) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb"); \ - else \ - __memtrack_skb = alloc_skb(size, prio); \ - 
__memtrack_skb; \ -}) - -#define dev_alloc_skb(size) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "dev_alloc_skb"); \ - else \ - __memtrack_skb = dev_alloc_skb(size); \ - __memtrack_skb; \ -}) - -#define alloc_skb_fclone(size, prio) ({ \ - struct sk_buff *__memtrack_skb = NULL; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb_fclone"); \ - else \ - __memtrack_skb = alloc_skb_fclone(size, prio); \ - __memtrack_skb; \ -}) - -#define copy_from_user(to, from, n) ({ \ - int ret = n; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_from_user"); \ - else \ - ret = copy_from_user(to, from, n); \ - ret; \ -}) - -#define copy_to_user(to, from, n) ({ \ - int ret = n; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_to_user"); \ - else \ - ret = copy_to_user(to, from, n); \ - ret; \ -}) - -#define sysfs_create_file(kobj, attr) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_file"); \ - else \ - ret = sysfs_create_file(kobj, attr); \ - ret; \ -}) - -#define sysfs_create_link(kobj, target, name) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_link"); \ - else \ - ret = sysfs_create_link(kobj, target, name); \ - ret; \ -}) - -#define sysfs_create_group(kobj, grp) ({ \ - int ret = -ENOSYS; \ - \ - if (memtrack_inject_error()) \ - MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_group"); \ - else \ - ret = sysfs_create_group(kobj, grp); \ - ret; \ -}) - -#endif /* __mtrack_h_ */ - Index: sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig +++ /dev/null @@ -1,50 +0,0 @@ -config INFINIBAND_IPOIB - tristate "IP-over-InfiniBand" - depends on NETDEVICES && INET && (IPV6 || IPV6=n) - select INET_LRO - ---help--- - Support for the IP-over-InfiniBand protocol (IPoIB). This - transports IP packets over InfiniBand so you can use your IB - device as a fancy NIC. - - See Documentation/infiniband/ipoib.txt for more information - -config INFINIBAND_IPOIB_CM - bool "IP-over-InfiniBand Connected Mode support" - depends on INFINIBAND_IPOIB - default n - ---help--- - This option enables support for IPoIB connected mode. After - enabling this option, you need to switch to connected mode - through /sys/class/net/ibXXX/mode to actually create - connections, and then increase the interface MTU with - e.g. ifconfig ib0 mtu 65520. - - WARNING: Enabling connected mode will trigger some packet - drops for multicast and UD mode traffic from this interface, - unless you limit mtu for these destinations to 2044. - -config INFINIBAND_IPOIB_DEBUG - bool "IP-over-InfiniBand debugging" if EMBEDDED - depends on INFINIBAND_IPOIB - default y - ---help--- - This option causes debugging code to be compiled into the - IPoIB driver. The output can be turned on via the - debug_level and mcast_debug_level module parameters (which - can also be set after the driver is loaded through sysfs). - - This option also creates a directory tree under ipoib/ in - debugfs, which contains files that expose debugging - information about IB multicast groups used by the IPoIB - driver. 
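Editorial note on the wrappers removed above: the deleted mtrack.h works by shadowing each kernel allocator with a GCC statement-expression macro that (a) optionally injects a failure, (b) calls the real allocator, and (c) records the address, size and call site in a hash of tracked objects so leaks can be reported at unload. The following is a minimal, self-contained user-space sketch of that same pattern, offered only as an illustration; the names (tracked_malloc, trk_note, trk_forget, trk_inject) are hypothetical and are not part of the removed code or of any kernel API.

/*
 * Minimal user-space analogue of the mtrack.h wrapper pattern:
 * shadow the allocator with a statement-expression macro that
 * records successful allocations and can inject failures.
 * All identifiers here are illustrative, not the kernel ones.
 */
#include <stdio.h>
#include <stdlib.h>

struct trk_rec {
	void *addr;
	size_t size;
	const char *file;
	int line;
	struct trk_rec *next;
};

static struct trk_rec *trk_head;	/* list of live allocations */
static unsigned long trk_calls;
static unsigned long trk_inject;	/* fail every Nth call when != 0 */

static void
trk_note(void *p, size_t sz, const char *file, int line)
{
	struct trk_rec *r = malloc(sizeof(*r));

	if (r == NULL)
		return;
	r->addr = p; r->size = sz; r->file = file; r->line = line;
	r->next = trk_head;
	trk_head = r;
}

static void
trk_forget(void *p, const char *file, int line)
{
	struct trk_rec **rp, *r;

	for (rp = &trk_head; (r = *rp) != NULL; rp = &r->next) {
		if (r->addr == p) {
			*rp = r->next;
			free(r);
			return;
		}
	}
	fprintf(stderr, "free of untracked %p at %s:%d\n", p, file, line);
}

/* Shadow the allocator the way mtrack.h shadows kmalloc()/kfree(). */
#define tracked_malloc(sz) ({						\
	void *__p = NULL;						\
	if (trk_inject != 0 && (++trk_calls % trk_inject) == 0)	\
		fprintf(stderr, "injected failure at %s:%d\n",		\
		    __FILE__, __LINE__);				\
	else								\
		__p = malloc(sz);					\
	if (__p != NULL)						\
		trk_note(__p, (sz), __FILE__, __LINE__);		\
	__p;								\
})

#define tracked_free(p) ({						\
	void *__p = (p);						\
	if (__p != NULL)						\
		trk_forget(__p, __FILE__, __LINE__);			\
	free(__p);							\
})

int
main(void)
{
	char *leak = tracked_malloc(64);
	char *ok = tracked_malloc(16);
	struct trk_rec *r;

	(void)leak;
	tracked_free(ok);
	for (r = trk_head; r != NULL; r = r->next)	/* leak report */
		printf("leak: %p (%zu bytes) from %s:%d\n",
		    r->addr, r->size, r->file, r->line);
	return (0);
}

The statement-expression form matters because the wrapper must be usable anywhere the wrapped allocator appears as an expression (e.g. in an initializer), which is why the removed macros take that shape rather than being plain functions.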
- -config INFINIBAND_IPOIB_DEBUG_DATA - bool "IP-over-InfiniBand data path debugging" - depends on INFINIBAND_IPOIB_DEBUG - ---help--- - This option compiles debugging code into the data path - of the IPoIB driver. The output can be turned on via the - data_debug_level module parameter; however, even with output - turned off, this debugging code will have some performance - impact. Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h @@ -325,6 +325,7 @@ unsigned long flags; int gone; + int unit; struct mutex vlan_mutex; @@ -349,7 +350,6 @@ u16 pkey; u16 pkey_index; struct ib_pd *pd; - struct ib_mr *mr; struct ib_cq *recv_cq; struct ib_cq *send_cq; struct ib_qp *qp; @@ -368,7 +368,7 @@ unsigned tx_head; unsigned tx_tail; struct ib_sge tx_sge[IPOIB_MAX_TX_SG]; - struct ib_send_wr tx_wr; + struct ib_ud_wr tx_wr; unsigned tx_outstanding; struct ib_wc send_wc[MAX_SEND_CQE]; Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -256,7 +256,7 @@ eh = mtod(mb, struct ipoib_header *); bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */ - if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID; dev->if_input(dev, mb); @@ -451,21 +451,20 @@ priv->tx_sge[i].addr = mapping[i]; priv->tx_sge[i].length = m->m_len; } - priv->tx_wr.num_sge = i; - priv->tx_wr.wr_id = wr_id; - priv->tx_wr.wr.ud.remote_qpn = qpn; - priv->tx_wr.wr.ud.ah = address; - + priv->tx_wr.wr.num_sge = i; + priv->tx_wr.wr.wr_id = wr_id; + priv->tx_wr.remote_qpn = qpn; + priv->tx_wr.ah = address; if (head) { - priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ - priv->tx_wr.wr.ud.header = head; - priv->tx_wr.wr.ud.hlen = hlen; - priv->tx_wr.opcode = IB_WR_LSO; + priv->tx_wr.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */ + priv->tx_wr.header = head; + priv->tx_wr.hlen = hlen; + priv->tx_wr.wr.opcode = IB_WR_LSO; } else - priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.opcode = IB_WR_SEND; - return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); + return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); } void @@ -524,9 +523,9 @@ } if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) - priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; else - priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -84,7 +84,7 @@ struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); -static void ipoib_remove_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); @@ -99,6 
+99,31 @@ } \ } while (0) +static struct unrhdr *ipoib_unrhdr; + +static void +ipoib_unrhdr_init(void *arg) +{ + + ipoib_unrhdr = new_unrhdr(0, 65535, NULL); +} +SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL); + +static void +ipoib_unrhdr_uninit(void *arg) +{ + + if (ipoib_unrhdr != NULL) { + struct unrhdr *hdr; + + hdr = ipoib_unrhdr; + ipoib_unrhdr = NULL; + + delete_unrhdr(hdr); + } +} +SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); + /* * This is for clients that have an ipoib_header in the mbuf. */ @@ -808,6 +833,7 @@ bpfdetach(dev); if_detach(dev); if_free(dev); + free_unr(ipoib_unrhdr, priv->unit); } else VLAN_SETCOOKIE(priv->dev, NULL); @@ -834,8 +860,6 @@ priv->tx_ring = NULL; } -static volatile int ipoib_unit; - static struct ipoib_dev_priv * ipoib_priv_alloc(void) { @@ -876,7 +900,13 @@ return NULL; } dev->if_softc = priv; - if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1)); + priv->unit = alloc_unr(ipoib_unrhdr); + if (priv->unit == -1) { + if_free(dev); + free(priv, M_TEMP); + return NULL; + } + if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; @@ -903,27 +933,10 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { - struct ib_device_attr *device_attr; - int result = -ENOMEM; - - device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); - if (!device_attr) { - printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", - hca->name, sizeof *device_attr); - return result; - } + struct ib_device_attr *device_attr = &hca->attrs; - result = ib_query_device(hca, device_attr); - if (result) { - printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", - hca->name, result); - kfree(device_attr); - return result; - } priv->hca_caps = device_attr->device_cap_flags; - kfree(device_attr); - priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; @@ -991,7 +1004,7 @@ priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; - result = ib_query_gid(hca, port, 0, &priv->local_gid); + result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); @@ -1070,15 +1083,16 @@ } static void -ipoib_remove_one(struct ib_device *device) +ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp; - struct list_head *dev_list; + struct list_head *dev_list = client_data; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + if (!dev_list) return; - dev_list = ib_get_client_data(device, &ipoib_client); + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -167,7 +167,7 @@ } priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); - priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + priv->tx_wr.remote_qkey = priv->qkey; set_qkey = 1; } @@ -480,7 +480,7 @@ return; } - if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) + if (ib_query_gid(priv->ca, 
priv->port, 0, &priv->local_gid, NULL)) ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -137,23 +137,18 @@ .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_UD }; + struct ib_cq_init_attr cq_attr = {}; int ret, size; int i; /* XXX struct ethtool_coalesce *coal; */ - priv->pd = ib_alloc_pd(priv->ca); + priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_pd; - } - size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(priv); if (!ret) { @@ -164,14 +159,16 @@ size += ipoib_recvq_size * ipoib_max_conn_qp; } - priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0); + cq_attr.cqe = size; + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); goto out_free_mr; } + cq_attr.cqe = ipoib_sendq_size; priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, - priv, ipoib_sendq_size, 0); + priv, &cq_attr); if (IS_ERR(priv->send_cq)) { printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; @@ -215,14 +212,14 @@ IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff; for (i = 0; i < IPOIB_MAX_TX_SG; ++i) - priv->tx_sge[i].lkey = priv->mr->lkey; + priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; - priv->tx_wr.opcode = IB_WR_SEND; - priv->tx_wr.sg_list = priv->tx_sge; - priv->tx_wr.send_flags = IB_SEND_SIGNALED; + priv->tx_wr.wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.sg_list = priv->tx_sge; + priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED; for (i = 0; i < IPOIB_UD_RX_SG; ++i) - priv->rx_sge[i].lkey = priv->mr->lkey; + priv->rx_sge[i].lkey = priv->pd->local_dma_lkey; priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; @@ -235,10 +232,8 @@ ib_destroy_cq(priv->recv_cq); out_free_mr: - ib_dereg_mr(priv->mr); ipoib_cm_dev_cleanup(priv); -out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; } @@ -262,11 +257,7 @@ ipoib_cm_dev_cleanup(priv); - if (ib_dereg_mr(priv->mr)) - ipoib_warn(priv, "ib_dereg_mr failed\n"); - - if (ib_dealloc_pd(priv->pd)) - ipoib_warn(priv, "ib_dealloc_pd failed\n"); + ib_dealloc_pd(priv->pd); } void ipoib_event(struct ib_event_handler *handler, Index: sys/ofed/include/rdma/ib.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/ib.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include +#include +#include + +/* + * Define a native infiniband address as in Linux upstream + * 8d36eb01da5d371feffa280e501377b5c450f5a5 + */ +#define AF_IB 41 + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +/* + * The IB interfaces that use write() as bi-directional ioctl() are + * fundamentally unsafe, since there are lots of ways to trigger "write()" + * calls from various contexts with elevated privileges. That includes the + * traditional suid executable error message writes, but also various kernel + * interfaces that can write to file descriptors. + * + * This function provides protection for the legacy API by restricting the + * calling context. 
+ */ +static inline bool ib_safe_file_access(struct file *filp) +{ + struct thread *td = curthread; + + /* + * Check if called from userspace through a devfs related + * system call belonging to the given file: + */ + return (filp->_file != NULL && + filp->_file == td->td_fpop && + filp->_file->f_cred == td->td_ucred); +} + +#endif /* _RDMA_IB_H */ Index: sys/ofed/include/rdma/ib_addr.h =================================================================== --- sys/ofed/include/rdma/ib_addr.h +++ sys/ofed/include/rdma/ib_addr.h @@ -31,7 +31,7 @@ * SOFTWARE. */ -#ifndef IB_ADDR_H +#if !defined(IB_ADDR_H) #define IB_ADDR_H #include @@ -41,9 +41,11 @@ #include #include #include +#include +#include +#include #include #include -#include #include struct rdma_addr_client { @@ -62,6 +64,17 @@ */ void rdma_addr_unregister_client(struct rdma_addr_client *client); +/** + * struct rdma_dev_addr - Contains resolved RDMA hardware addresses + * @src_dev_addr: Source MAC address. + * @dst_dev_addr: Destination MAC address. + * @broadcast: Broadcast address of the device. + * @dev_type: The interface hardware type of the device. + * @bound_dev_if: An optional device interface index. + * @transport: The transport type used. + * @net: Network namespace containing the bound_dev_if net_dev. + */ +struct vnet; struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; @@ -69,14 +82,19 @@ unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; + struct vnet *net; + enum rdma_network_type network; + int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. + * + * The dev_addr->net field must be initialized. */ -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id); +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to @@ -88,7 +106,7 @@ * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has - * been invoked. + * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. @@ -101,20 +119,22 @@ struct rdma_dev_addr *addr, void *context), void *context); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr); -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id, - u32 scope_id); -int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac, - u16 *vlan_id, u32 scope_id); -static inline int ip_addr_size(struct sockaddr *addr) -{ - return addr->sa_family == AF_INET6 ? 
- sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); -} +int rdma_addr_size(struct sockaddr *addr); + +int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *smac, u16 *vlan_id, int *if_index, + int *hoplimit); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { @@ -147,16 +167,16 @@ return tag; } -static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) +static inline int rdma_ip2gid(const struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: - ipv6_addr_set_v4mapped(((struct sockaddr_in *)addr)->sin_addr.s_addr, + ipv6_addr_set_v4mapped(((const struct sockaddr_in *) + addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: - memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, - 16); + memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16); break; default: return -EINVAL; @@ -165,10 +185,9 @@ } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ -static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid, - uint32_t scope_id) +static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { - if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + if (ipv6_addr_v4mapped((const struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_len = sizeof(*out_in); @@ -180,21 +199,28 @@ out_in->sin6_len = sizeof(*out_in); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); - if (scope_id < 256 && - IN6_IS_SCOPE_LINKLOCAL(&out_in->sin6_addr)) - out_in->sin6_scope_id = scope_id; } - return 0; } -u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num); - -/* This func is called only in loopback ip address (127.0.0.1) - * case in which sgid is not relevant - */ static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { + struct net_device *dev; + struct ifaddr *ifa; + + dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (dev) { + TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET) + continue; + ipv6_addr_set_v4mapped(((struct sockaddr_in *) + ifa->ifa_addr)->sin_addr.s_addr, + (struct in6_addr *)gid); + break; + } + dev_put(dev); + } } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) @@ -246,13 +272,19 @@ static inline int iboe_get_rate(struct net_device *dev) { - if (dev->if_baudrate >= IF_Gbps(40)) + uint64_t baudrate = dev->if_baudrate; +#ifdef if_baudrate_pf + int exp; + for (exp = dev->if_baudrate_pf; exp > 0; exp--) + baudrate *= 10; +#endif + if (baudrate >= IF_Gbps(40)) return IB_RATE_40_GBPS; - else if (dev->if_baudrate >= IF_Gbps(30)) + else if (baudrate >= IF_Gbps(30)) return IB_RATE_30_GBPS; - else if (dev->if_baudrate >= IF_Gbps(20)) + else if (baudrate >= IF_Gbps(20)) return IB_RATE_20_GBPS; - else if (dev->if_baudrate >= IF_Gbps(10)) + else if (baudrate >= IF_Gbps(10)) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; @@ -279,20 +311,6 @@ return addr->s6_addr[0] == 0xff; } -static inline void resolve_mcast_mac(struct in6_addr *addr, u8 *mac) -{ - if (addr->s6_addr[0] != 0xff) - return; - -#ifdef DUAL_MODE_MCAST_MAC - if (addr->s6_addr[1] == 0x0e) /* IPv4 */ - ip_eth_mc_map(addr->s6_addr32[3], mac); - else -#endif - ipv6_eth_mc_map(addr, mac); -} - - static 
inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; @@ -308,7 +326,7 @@ u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; - return vid < 0x1000 ? vid : 0xffff; + return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) Index: sys/ofed/include/rdma/ib_cache.h =================================================================== --- sys/ofed/include/rdma/ib_cache.h +++ sys/ofed/include/rdma/ib_cache.h @@ -43,6 +43,8 @@ * @port_num: The port number of the device to query. * @index: The index into the cached GID table to query. * @gid: The GID value found at the specified index. + * @attr: The GID attribute found at the specified index (only in RoCE). + * NULL means ignore (output parameter). * * ib_get_cached_gid() fetches the specified GID table entry stored in * the local software cache. @@ -50,13 +52,16 @@ int ib_get_cached_gid(struct ib_device *device, u8 port_num, int index, - union ib_gid *gid); + union ib_gid *gid, + struct ib_gid_attr *attr); /** * ib_find_cached_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @ndev: In RoCE, the net device of the device. NULL means ignore. * @port_num: The port number of the device where the GID value was found. * @index: The index into the cached GID table where the GID was found. This * parameter may be NULL. @@ -65,11 +70,42 @@ * the local software cache. */ int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, u8 *port_num, u16 *index); /** + * ib_find_cached_gid_by_port - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @port_num: The port number of the device where the GID value sould be + * searched. + * @ndev: In RoCE, the net device of the device. Null means ignore. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid() searches for the specified GID value in + * the local software cache. + */ +int ib_find_cached_gid_by_port(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port_num, + struct net_device *ndev, + u16 *index); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index); +/** * ib_get_cached_pkey - Returns a cached PKey table entry * @device: The device to query. * @port_num: The port number of the device to query. 
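The cached-GID lookups above now take a GID type and an optional net_device /
ib_gid_attr. A minimal sketch of the new calling convention, assuming a
hypothetical ib_device pointer named "dev" (illustrative only, not part of the
diff):

	union ib_gid gid;
	u8 port;
	u16 index;

	/* Read entry 0 of port 1; pass NULL to ignore the attribute output. */
	if (ib_get_cached_gid(dev, 1, 0, &gid, NULL) == 0 &&
	    ib_find_cached_gid(dev, &gid, IB_GID_TYPE_IB, NULL, &port, &index) == 0)
		printf("GID found at port %u index %u\n", port, index);
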
Index: sys/ofed/include/rdma/ib_cm.h =================================================================== --- sys/ofed/include/rdma/ib_cm.h +++ sys/ofed/include/rdma/ib_cm.h @@ -105,13 +105,16 @@ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, IB_CM_SIDR_REP_INFO_LENGTH = 72, - IB_CM_COMPARE_SIZE = 64 }; struct ib_cm_id; struct ib_cm_req_event_param { struct ib_cm_id *listen_id; + + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; + u8 port; struct ib_sa_path_rec *primary_path; @@ -222,6 +225,9 @@ struct ib_cm_sidr_req_event_param { struct ib_cm_id *listen_id; + __be64 service_id; + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; u8 port; u16 pkey; }; @@ -336,11 +342,6 @@ #define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL) #define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL) -struct ib_cm_compare_data { - u8 data[IB_CM_COMPARE_SIZE]; - u8 mask[IB_CM_COMPARE_SIZE]; -}; - /** * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. @@ -353,12 +354,13 @@ * range of service IDs. If set to 0, the service ID is matched * exactly. This parameter is ignored if %service_id is set to * IB_CM_ASSIGN_SERVICE_ID. - * @compare_data: This parameter is optional. It specifies data that must - * appear in the private data of a connection request for the specified - * listen request. */ -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, - struct ib_cm_compare_data *compare_data); +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask); + +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id); struct ib_cm_req_param { struct ib_sa_path_rec *primary_path; @@ -601,6 +603,4 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_param *param); -int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac); - #endif /* IB_CM_H */ Index: sys/ofed/include/rdma/ib_hdrs.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/ib_hdrs.h @@ -0,0 +1,178 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef IB_HDRS_H +#define IB_HDRS_H + +#include +#include +#include + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +#define IB_BTH_REQ_ACK BIT(31) +#define IB_BTH_SOLICITED BIT(23) +#define IB_BTH_MIG_REQ BIT(22) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B + +struct ib_reth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be64 swap_data; /* potentially unaligned */ + __be64 compare_data; /* potentially unaligned */ +} __packed; + +union ib_ehdrs { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be64 atomic_ack_eth; /* potentially unaligned */ + } __packed at; + __be32 imm_data; + __be32 aeth; + __be32 ieth; + struct ib_atomic_eth atomic_eth; +} __packed; + +struct ib_other_headers { + __be32 bth[3]; + union ib_ehdrs u; +} __packed; + +struct ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + +/* accessors for unaligned __be64 items */ + +static inline u64 ib_u64_get(__be64 *p) +{ + return get_unaligned_be64(p); +} + +static inline void ib_u64_put(u64 val, __be64 *p) +{ + put_unaligned_be64(val, p); +} + +static inline u64 get_ib_reth_vaddr(struct ib_reth *reth) +{ + return ib_u64_get(&reth->vaddr); +} + +static inline void put_ib_reth_vaddr(u64 val, struct ib_reth *reth) +{ + ib_u64_put(val, &reth->vaddr); +} + +static inline u64 get_ib_ateth_vaddr(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->vaddr); +} + +static inline void put_ib_ateth_vaddr(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->vaddr); +} + +static inline u64 get_ib_ateth_swap(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->swap_data); +} + +static inline void put_ib_ateth_swap(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->swap_data); +} + +static inline u64 get_ib_ateth_compare(struct 
ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->compare_data); +} + +static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->compare_data); +} + +#endif /* IB_HDRS_H */ Index: sys/ofed/include/rdma/ib_mad.h =================================================================== --- sys/ofed/include/rdma/ib_mad.h +++ sys/ofed/include/rdma/ib_mad.h @@ -40,9 +40,13 @@ #include #include +#include -/* Management base version */ +/* Management base versions */ #define IB_MGMT_BASE_VERSION 1 +#define OPA_MGMT_BASE_VERSION 0x80 + +#define OPA_SMP_CLASS_VERSION 0x80 /* Management classes */ #define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 @@ -123,6 +127,23 @@ #define IB_DEFAULT_PKEY_PARTIAL 0x7FFF #define IB_DEFAULT_PKEY_FULL 0xFFFF +/* + * Generic trap/notice types + */ +#define IB_NOTICE_TYPE_FATAL 0x80 +#define IB_NOTICE_TYPE_URGENT 0x81 +#define IB_NOTICE_TYPE_SECURITY 0x82 +#define IB_NOTICE_TYPE_SM 0x83 +#define IB_NOTICE_TYPE_INFO 0x84 + +/* + * Generic trap/notice producers + */ +#define IB_NOTICE_PROD_CA cpu_to_be16(1) +#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2) +#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3) +#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4) + enum { IB_MGMT_MAD_HDR = 24, IB_MGMT_MAD_DATA = 232, @@ -134,6 +155,10 @@ IB_MGMT_SA_DATA = 200, IB_MGMT_DEVICE_HDR = 64, IB_MGMT_DEVICE_DATA = 192, + IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA, + OPA_MGMT_MAD_DATA = 2024, + OPA_MGMT_RMPP_DATA = 2012, + OPA_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + OPA_MGMT_MAD_DATA, }; struct ib_mad_hdr { @@ -180,12 +205,23 @@ u8 data[IB_MGMT_MAD_DATA]; }; +struct opa_mad { + struct ib_mad_hdr mad_hdr; + u8 data[OPA_MGMT_MAD_DATA]; +}; + struct ib_rmpp_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; u8 data[IB_MGMT_RMPP_DATA]; }; +struct opa_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[OPA_MGMT_RMPP_DATA]; +}; + struct ib_sa_mad { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; @@ -201,12 +237,17 @@ u8 data[IB_MGMT_VENDOR_DATA]; }; +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + +#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F +#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5 + struct ib_class_port_info { u8 base_version; u8 class_version; __be16 capability_mask; - u8 reserved[3]; - u8 resp_time_value; + /* 27 bits for cap_mask2, 5 bits for resp_time */ + __be32 cap_mask2_resp_time; u8 redirect_gid[16]; __be32 redirect_tcslfl; __be16 redirect_lid; @@ -222,6 +263,123 @@ }; /** + * ib_get_cpi_resp_time - Returns the resp_time value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi) +{ + return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) & + IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_set_cpi_resptime - Sets the response time in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @rtime: The response time to set. + */ +static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi, + u8 rtime) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. 
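 *
 * The combined field keeps capmask2 in the upper 27 bits and resp_time in
 * the low 5 bits; for example (illustrative numbers), a stored value of
 * cpu_to_be32(0x32) gives ib_get_cpi_resp_time() == 0x12 and
 * ib_get_cpi_capmask2() == 0x1.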
+ */ +static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +/** + * ib_set_cpi_capmask2 - Sets the capmask2 in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @capmask2: The capmask2 to set. + */ +static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi, + u32 capmask2) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(capmask2 << + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +struct ib_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 issuer_lid; + __be16 toggle_count; + + union { + struct { + u8 details[54]; + } raw_data; + + struct { + __be16 reserved; + __be16 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_131; + + struct { + __be16 reserved; + __be16 lid; /* LID where change occurred */ + u8 reserved2; + u8 local_changes; /* low bit - local changes */ + __be32 new_cap_mask; /* new capability mask */ + u8 reserved3; + u8 change_flags; /* low 3 bits only */ + } __packed ntc_144; + + struct { + __be16 reserved; + __be16 lid; /* lid where sys guid changed */ + __be16 reserved2; + __be64 new_sys_guid; + } __packed ntc_145; + + struct { + __be16 reserved; + __be16 lid; + __be16 dr_slid; + u8 method; + u8 reserved2; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 reserved3; + u8 dr_trunc_hop; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be16 reserved; + __be16 lid1; + __be16 lid2; + __be32 key; + __be32 sl_qp1; /* SL: high 4 bits */ + __be32 qp2; /* high 8 bits reserved */ + union ib_gid gid1; + union ib_gid gid2; + } __packed ntc_257_258; + + } details; +}; + +/** * ib_mad_send_buf - MAD data buffer and work request for sends. * @next: A pointer used to chain together MADs for posting. * @mad: References an allocated MAD data buffer for MADs that do not have @@ -234,7 +392,10 @@ * includes the common MAD, RMPP, and class specific headers. * @data_len: Indicates the total size of user-transferred data. * @seg_count: The number of RMPP segments allocated for this send. - * @seg_size: Size of each RMPP segment. + * @seg_size: Size of the data in each RMPP segment. This does not include + * class specific headers. + * @seg_rmpp_size: Size of each RMPP segment including the class specific + * headers. * @timeout_ms: Time to wait for a response. * @retries: Number of times to retry a request for a response. For MADs * using RMPP, this applies per window. On completion, returns the number @@ -254,6 +415,7 @@ int data_len; int seg_count; int seg_size; + int seg_rmpp_size; int timeout_ms; int retries; }; @@ -262,7 +424,7 @@ * ib_response_mad - Returns if the specified MAD has been generated in * response to a sent request or trap. */ -int ib_response_mad(struct ib_mad *mad); +int ib_response_mad(const struct ib_mad_hdr *hdr); /** * ib_get_rmpp_resptime - Returns the RMPP response time. @@ -277,7 +439,7 @@ * ib_get_rmpp_flags - Returns the RMPP flags. * @rmpp_hdr: An RMPP header. */ -static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) +static inline u8 ib_get_rmpp_flags(const struct ib_rmpp_hdr *rmpp_hdr) { return rmpp_hdr->rmpp_rtime_flags & 0x7; } @@ -318,11 +480,11 @@ /** * ib_mad_snoop_handler - Callback handler for snooping sent MADs. * @mad_agent: MAD agent that snooped the MAD. 
- * @send_wr: Work request information on the sent MAD. + * @send_buf: send MAD data buffer. * @mad_send_wc: Work completion information on the sent MAD. Valid * only for snooping that occurs on a send completion. * - * Clients snooping MADs should not modify data referenced by the @send_wr + * Clients snooping MADs should not modify data referenced by the @send_buf * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, @@ -332,6 +494,7 @@ /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. + * @send_buf: Send buffer if found, else NULL * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to @@ -341,6 +504,7 @@ * modify the data referenced by @mad_recv_wc. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc); /** @@ -355,18 +519,22 @@ * @hi_tid: Access layer assigned transaction ID for this client. * Unsolicited MADs sent by this client will have the upper 32-bits * of their TID set to this value. + * @flags: registration flags * @port_num: Port number on which QP is registered * @rmpp_version: If set, indicates the RMPP version used by this agent. */ +enum { + IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP, +}; struct ib_mad_agent { struct ib_device *device; struct ib_qp *qp; - struct ib_mr *mr; ib_mad_recv_handler recv_handler; ib_mad_send_handler send_handler; ib_mad_snoop_handler snoop_handler; void *context; u32 hi_tid; + u32 flags; u8 port_num; u8 rmpp_version; }; @@ -395,7 +563,10 @@ struct ib_mad_recv_buf { struct list_head list; struct ib_grh *grh; - struct ib_mad *mad; + union { + struct ib_mad *mad; + struct opa_mad *opa_mad; + }; }; /** @@ -404,6 +575,7 @@ * @recv_buf: Specifies the location of the received data buffer(s). * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. * @mad_len: The length of the received MAD, without duplicated headers. + * @mad_seg_size: The size of individual MAD segments * * For received response, the wr_id contains a pointer to the ib_mad_send_buf * for the corresponding send request. @@ -413,6 +585,7 @@ struct ib_mad_recv_buf recv_buf; struct list_head rmpp_list; int mad_len; + size_t mad_seg_size; }; /** @@ -426,6 +599,7 @@ * in the range from 0x30 to 0x4f. Otherwise not used. * @method_mask: The caller will receive unsolicited MADs for any method * where @method_mask = 1. + * */ struct ib_mad_reg_req { u8 mgmt_class; @@ -451,6 +625,7 @@ * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. + * @registration_flags: Registration flags to set for this agent */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, @@ -459,7 +634,8 @@ u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context); + void *context, + u32 registration_flags); enum ib_mad_snoop_flags { /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ @@ -609,6 +785,7 @@ * automatically adjust the allocated buffer size to account for any * additional padding that may be necessary. * @gfp_mask: GFP mask used for the memory allocation. + * @base_version: Base Version of this MAD * * This routine allocates a MAD for sending. 
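 * As an illustrative call only (agent, remote_qpn and pkey_index assumed),
 * a plain, non-RMPP request could be allocated with
 * ib_create_send_mad(agent, remote_qpn, pkey_index, 0, IB_MGMT_MAD_HDR,
 * IB_MGMT_MAD_DATA, GFP_KERNEL, IB_MGMT_BASE_VERSION), the final argument
 * being the new base_version parameter.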
The returned MAD send buffer * will reference a data buffer usable for sending a MAD, along @@ -624,7 +801,8 @@ u32 remote_qpn, u16 pkey_index, int rmpp_active, int hdr_len, int data_len, - gfp_t gfp_mask); + gfp_t gfp_mask, + u8 base_version); /** * ib_is_mad_class_rmpp - returns whether given management class @@ -661,4 +839,11 @@ */ void ib_free_send_mad(struct ib_mad_send_buf *send_buf); +/** + * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP. + * @agent: the agent in question + * @return: true if agent is performing rmpp, false otherwise. + */ +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent); + #endif /* IB_MAD_H */ Index: sys/ofed/include/rdma/ib_pack.h =================================================================== --- sys/ofed/include/rdma/ib_pack.h +++ sys/ofed/include/rdma/ib_pack.h @@ -40,6 +40,8 @@ IB_ETH_BYTES = 14, IB_VLAN_BYTES = 4, IB_GRH_BYTES = 40, + IB_IP4_BYTES = 20, + IB_UDP_BYTES = 8, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 }; @@ -75,6 +77,8 @@ IB_OPCODE_UC = 0x20, IB_OPCODE_RD = 0x40, IB_OPCODE_UD = 0x60, + /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */ + IB_OPCODE_CNP = 0x80, /* operations -- just used to define real constants */ IB_OPCODE_SEND_FIRST = 0x00, @@ -98,6 +102,9 @@ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, IB_OPCODE_COMPARE_SWAP = 0x13, IB_OPCODE_FETCH_ADD = 0x14, + /* opcode 0x15 is reserved */ + IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, + IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ @@ -124,6 +131,8 @@ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RC, COMPARE_SWAP), IB_OPCODE(RC, FETCH_ADD), + IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), + IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), /* UC */ IB_OPCODE(UC, SEND_FIRST), @@ -220,6 +229,27 @@ __be16 type; }; +struct ib_unpacked_ip4 { + u8 ver; + u8 hdr_len; + u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + u8 ttl; + u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; +}; + +struct ib_unpacked_udp { + __be16 sport; + __be16 dport; + __be16 length; + __be16 csum; +}; + struct ib_unpacked_vlan { __be16 tag; __be16 type; @@ -228,16 +258,20 @@ struct ib_ud_header { int lrh_present; struct ib_unpacked_lrh lrh; - int eth_present; - struct ib_unpacked_eth eth; + int eth_present; + struct ib_unpacked_eth eth; int vlan_present; struct ib_unpacked_vlan vlan; - int grh_present; - struct ib_unpacked_grh grh; - struct ib_unpacked_bth bth; + int grh_present; + struct ib_unpacked_grh grh; + int ipv4_present; + struct ib_unpacked_ip4 ip4; + int udp_present; + struct ib_unpacked_udp udp; + struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; - int immediate_present; - __be32 immediate_data; + int immediate_present; + __be32 immediate_data; }; void ib_pack(const struct ib_field *desc, @@ -250,13 +284,17 @@ void *buf, void *structure); -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header); +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header); + +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header); int ib_ud_header_pack(struct ib_ud_header *header, void *buf); Index: sys/ofed/include/rdma/ib_peer_mem.h =================================================================== --- sys/ofed/include/rdma/ib_peer_mem.h +++ 
/dev/null @@ -1,59 +0,0 @@ -#if !defined(IB_PEER_MEM_H) -#define IB_PEER_MEM_H - -#include - - -struct invalidation_ctx; -struct ib_ucontext; - -struct ib_peer_memory_statistics { - unsigned long num_alloc_mrs; - unsigned long num_dealloc_mrs; - unsigned long num_reg_pages; - unsigned long num_dereg_pages; - unsigned long num_free_callbacks; -}; - -struct ib_peer_memory_client { - const struct peer_memory_client *peer_mem; - - struct list_head core_peer_list; - struct list_head core_ticket_list; - unsigned long last_ticket; -#ifdef __FreeBSD__ - int holdcount; - int needwakeup; - struct cv peer_cv; -#else - struct srcu_struct peer_srcu; -#endif - struct mutex lock; - struct kobject *kobj; - struct attribute_group peer_mem_attr_group; - struct ib_peer_memory_statistics stats; -}; - -struct core_ticket { - unsigned long key; - void *context; - struct list_head ticket_list; -}; - -struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr, - size_t size, void **peer_client_context, - int *srcu_key); - -void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, - void *peer_client_context, - int srcu_key); - -unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, - void *context); -int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key); -struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client, - unsigned long key); -#endif - - Index: sys/ofed/include/rdma/ib_pma.h =================================================================== --- sys/ofed/include/rdma/ib_pma.h +++ sys/ofed/include/rdma/ib_pma.h @@ -37,30 +37,12 @@ #include -#define MAX_U32 0xffffffffULL -#define MAX_U16 0xffffUL - -/* Counters should be saturate once they reach their maximum value */ -#define ASSIGN_32BIT_COUNTER(counter, value) do { \ - if ((value) > MAX_U32) \ - counter = cpu_to_be32(MAX_U32); \ - else \ - counter = cpu_to_be32(value); \ -} while (0) - -/* Counters should be saturate once they reach their maximum value */ -#define ASSIGN_16BIT_COUNTER(counter, value) do { \ - if ((value) > MAX_U16) \ - counter = cpu_to_be16(MAX_U16); \ - else \ - counter = cpu_to_be16(value); \ -} while (0) - /* * PMA class portinfo capability mask bits */ #define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) #define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10) #define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) #define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) Index: sys/ofed/include/rdma/ib_sa.h =================================================================== --- sys/ofed/include/rdma/ib_sa.h +++ sys/ofed/include/rdma/ib_sa.h @@ -37,8 +37,7 @@ #include #include - -#include +#include #include #include @@ -94,6 +93,21 @@ }; /* + * There are 4 types of join states: + * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. + * The order corresponds to JoinState bits in MCMemberRecord. + */ +enum ib_sa_mc_join_states { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + +#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) + +/* * Structures for SA records are named "struct ib_sa_xxx_rec." 
No * attempt is made to pack structures to match the physical layout of * SA records in SA MADs; all packing and unpacking is handled by the @@ -154,11 +168,19 @@ u8 packet_life_time_selector; u8 packet_life_time; u8 preference; - u8 smac[ETH_ALEN]; - u8 dmac[6]; - __be16 vlan_id; + u8 dmac[ETH_ALEN]; + /* ignored in IB */ + int ifindex; + /* ignored in IB */ + struct vnet *net; + enum ib_gid_type gid_type; }; +static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) +{ + return rec->net ? dev_get_by_index(rec->net, rec->ifindex) : NULL; +} + #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) #define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) #define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) @@ -394,6 +416,8 @@ */ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr); /** @@ -405,6 +429,12 @@ struct ib_ah_attr *ah_attr); /** + * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec + * to IB MAD wire format. + */ +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute); + +/** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec. */ @@ -412,13 +442,24 @@ /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_guidinfo_rec *rec, - ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_guidinfo_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +/* Support get SA ClassPortInfo */ +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + #endif /* IB_SA_H */ Index: sys/ofed/include/rdma/ib_smi.h =================================================================== --- sys/ofed/include/rdma/ib_smi.h +++ sys/ofed/include/rdma/ib_smi.h @@ -38,7 +38,6 @@ #define IB_SMI_H #include -#include #define IB_SMP_DATA_SIZE 64 #define IB_SMP_MAX_PATH_HOPS 64 @@ -120,10 +119,57 @@ u8 link_roundtrip_latency[3]; }; +struct ib_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __packed; + +struct ib_vl_weight_elem { + u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */ + /* OPA: VL is low 5 bits, upper 3 bits reserved */ + u8 weight; +}; + static inline u8 -ib_get_smp_direction(struct ib_smp *smp) +ib_get_smp_direction(const struct ib_smp *smp) { return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); } +/* + * SM Trap/Notice numbers + */ +#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129) +#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130) +#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131) +#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144) +#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145) +#define 
IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256) +#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257) +#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258) + +/* + * Other local changes flags (trap 144). + */ +#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +/* + * M_Key volation flags in dr_trunc_hop (trap 256). + */ +#define IB_NOTICE_TRAP_DR_NOTICE 0x80 +#define IB_NOTICE_TRAP_DR_TRUNC 0x40 + + #endif /* IB_SMI_H */ Index: sys/ofed/include/rdma/ib_umem.h =================================================================== --- sys/ofed/include/rdma/ib_umem.h +++ sys/ofed/include/rdma/ib_umem.h @@ -36,58 +36,73 @@ #include #include #include -#include -#include -#include struct ib_ucontext; -struct ib_umem; - -typedef void (*umem_invalidate_func_t)(void *invalidation_cookie, - struct ib_umem *umem, - unsigned long addr, size_t size); - -struct invalidation_ctx { - struct ib_umem *umem; - umem_invalidate_func_t func; - void *cookie; - unsigned long context_ticket; - int peer_callback; - int inflight_invalidation; - int peer_invalidated; - struct completion comp; -}; +struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; size_t length; - int offset; + unsigned long address; int page_size; int writable; - int hugetlb; struct work_struct work; + pid_t pid; + struct mm_struct *mm; unsigned long diff; - unsigned long start; + struct ib_umem_odp *odp_data; struct sg_table sg_head; - int nmap; + int nmap; int npages; - /* peer memory that manages this umem*/ - struct ib_peer_memory_client *ib_peer_mem; - struct invalidation_ctx *invalidation_ctx; - int peer_mem_srcu_key; - /* peer memory private context */ - void *peer_mem_client_context; }; +/* Returns the offset of the umem start relative to the first page. */ +static inline int ib_umem_offset(struct ib_umem *umem) +{ + return umem->address & ((unsigned long)umem->page_size - 1); +} + +/* Returns the first page of an ODP umem. */ +static inline unsigned long ib_umem_start(struct ib_umem *umem) +{ + return umem->address - ib_umem_offset(umem); +} + +/* Returns the address of the page after the last one of an ODP umem. 
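 *
 * Worked example with assumed numbers (illustrative only): with 4 KiB pages,
 * a umem registered at address 0x10234 with length 0x100 has
 * ib_umem_offset() == 0x234, ib_umem_start() == 0x10000,
 * ib_umem_end() == 0x11000 and hence ib_umem_num_pages() == 1.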
*/ +static inline unsigned long ib_umem_end(struct ib_umem *umem) +{ + return PAGE_ALIGN(umem->address + umem->length); +} + +static inline size_t ib_umem_num_pages(struct ib_umem *umem) +{ + return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; +} + +#ifdef CONFIG_INFINIBAND_USER_MEM + struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); -struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync, - int invalidation_supported); -void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, - umem_invalidate_func_t func, - void *cookie); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length); + +#else /* CONFIG_INFINIBAND_USER_MEM */ + +#include + +static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, + unsigned long addr, size_t size, + int access, int dmasync) { + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_release(struct ib_umem *umem) { } +static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} +#endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ Index: sys/ofed/include/rdma/ib_umem_odp.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/ib_umem_odp.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_ODP_H +#define IB_UMEM_ODP_H + +#include + +#include +#include + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; + +struct ib_umem_odp { + /* + * An array of the pages included in the on-demand paging umem. + * Indices of pages that are currently not mapped into the device will + * contain NULL. + */ + struct page **page_list; + /* + * An array of the same size as page_list, with DMA addresses mapped + * for pages the pages in page_list. The lower two bits designate + * access permissions. 
See ODP_READ_ALLOWED_BIT and + * ODP_WRITE_ALLOWED_BIT. + */ + dma_addr_t *dma_list; + /* + * The umem_mutex protects the page_list and dma_list fields of an ODP + * umem, allowing only a single thread to map/unmap pages. The mutex + * also protects access to the mmu notifier counters. + */ + struct mutex umem_mutex; + void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; +}; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); + +void ib_umem_odp_release(struct ib_umem *umem); + +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, + u64 access_mask, unsigned long current_seq); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, + u64 bound); + +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, + umem_call_back cb, void *cookie); + +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, + u64 start, u64 last); +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, + u64 start, u64 last); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. 
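 *
 * Illustrative fault-path usage, a sketch only with assumed names:
 *
 *	current_seq = READ_ONCE(umem->odp_data->notifiers_seq);
 *	ib_umem_odp_map_dma_pages(umem, start, len, access, current_seq);
 *	mutex_lock(&umem->odp_data->umem_mutex);
 *	if (!ib_umem_mmu_notifier_retry(umem, current_seq))
 *		... update the device's translation tables ...
 *	mutex_unlock(&umem->odp_data->umem_mutex);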
*/ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline int ib_umem_odp_get(struct ib_ucontext *context, + struct ib_umem *umem) +{ + return -EINVAL; +} + +static inline void ib_umem_odp_release(struct ib_umem *umem) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +#endif /* IB_UMEM_ODP_H */ Index: sys/ofed/include/rdma/ib_user_verbs_exp.h =================================================================== --- sys/ofed/include/rdma/ib_user_verbs_exp.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2006 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IB_USER_VERBS_EXP_H -#define IB_USER_VERBS_EXP_H - -#include - -enum { - IB_USER_VERBS_EXP_CMD_FIRST = 64 -}; - -enum { - IB_USER_VERBS_EXP_CMD_CREATE_QP, - IB_USER_VERBS_EXP_CMD_MODIFY_CQ, - IB_USER_VERBS_EXP_CMD_MODIFY_QP, - IB_USER_VERBS_EXP_CMD_CREATE_CQ, - IB_USER_VERBS_EXP_CMD_QUERY_DEVICE, - IB_USER_VERBS_EXP_CMD_CREATE_DCT, - IB_USER_VERBS_EXP_CMD_DESTROY_DCT, - IB_USER_VERBS_EXP_CMD_QUERY_DCT, -}; - -/* - * Make sure that all structs defined in this file remain laid out so - * that they pack the same way on 32-bit and 64-bit architectures (to - * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * Specifically: - * - Do not use pointer types -- pass pointers in __u64 instead. - * - Make sure that any structure larger than 4 bytes is padded to a - * multiple of 8 bytes. Otherwise the structure size will be - * different between 32-bit and 64-bit architectures. 
- */ - -enum ib_uverbs_exp_create_qp_comp_mask { - IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS = (1ULL << 0), - IB_UVERBS_EXP_CREATE_QP_INL_RECV = (1ULL << 1), - IB_UVERBS_EXP_CREATE_QP_QPG = (1ULL << 2) -}; - -struct ib_uverbs_qpg_init_attrib { - __u32 tss_child_count; - __u32 rss_child_count; -}; - -struct ib_uverbs_qpg { - __u32 qpg_type; - union { - struct { - __u32 parent_handle; - __u32 reserved; - }; - struct ib_uverbs_qpg_init_attrib parent_attrib; - }; - __u32 reserved2; -}; - -struct ib_uverbs_exp_create_qp { - __u64 comp_mask; - __u64 user_handle; - __u32 pd_handle; - __u32 send_cq_handle; - __u32 recv_cq_handle; - __u32 srq_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 is_srq; - __u8 reserved; - __u64 qp_cap_flags; - __u32 max_inl_recv; - __u32 reserved1; - struct ib_uverbs_qpg qpg; - __u64 driver_data[0]; -}; - -enum ib_uverbs_exp_create_qp_resp_comp_mask { - IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV = (1ULL << 0), -}; - -struct ib_uverbs_exp_create_qp_resp { - __u64 comp_mask; - __u32 qp_handle; - __u32 qpn; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u32 max_inl_recv; -}; - -struct ib_uverbs_create_dct { - __u64 comp_mask; - __u64 user_handle; - __u32 pd_handle; - __u32 cq_handle; - __u32 srq_handle; - __u32 access_flags; - __u32 flow_label; - __u64 dc_key; - __u8 min_rnr_timer; - __u8 tclass; - __u8 port; - __u8 pkey_index; - __u8 gid_index; - __u8 hop_limit; - __u8 mtu; - __u8 rsvd; - __u32 create_flags; - __u64 driver_data[0]; -}; - -struct ib_uverbs_create_dct_resp { - __u32 dct_handle; - __u32 dctn; -}; - -struct ib_uverbs_destroy_dct { - __u64 comp_mask; - __u64 user_handle; -}; - -struct ib_uverbs_destroy_dct_resp { - __u64 reserved; -}; - -struct ib_uverbs_query_dct { - __u64 comp_mask; - __u64 dct_handle; - __u64 driver_data[0]; -}; - -struct ib_uverbs_query_dct_resp { - __u64 dc_key; - __u32 access_flags; - __u32 flow_label; - __u32 key_violations; - __u8 port; - __u8 min_rnr_timer; - __u8 tclass; - __u8 mtu; - __u8 pkey_index; - __u8 gid_index; - __u8 hop_limit; - __u8 state; - __u32 rsvd; - __u64 driver_data[0]; -}; - -struct ib_uverbs_exp_query_device { - __u64 comp_mask; - __u64 driver_data[0]; -}; - -struct ib_uverbs_exp_query_device_resp { - __u64 comp_mask; - struct ib_uverbs_query_device_resp base; - __u64 timestamp_mask; - __u64 hca_core_clock; - __u64 device_cap_flags2; - __u32 dc_rd_req; - __u32 dc_rd_res; - __u32 inline_recv_sz; - __u32 max_rss_tbl_sz; -}; - -#endif /* IB_USER_VERBS_EXP_H */ Index: sys/ofed/include/rdma/ib_verbs.h =================================================================== --- sys/ofed/include/rdma/ib_verbs.h +++ sys/ofed/include/rdma/ib_verbs.h @@ -48,12 +48,24 @@ #include #include #include +#include #include -#include - +#include +#include +#include +#include +#include +#include +#include + +#include #include +struct ifla_vf_info; +struct ifla_vf_stats; + extern struct workqueue_struct *ib_wq; +extern struct workqueue_struct *ib_comp_wq; union ib_gid { u8 raw[16]; @@ -63,50 +75,116 @@ } global; }; +extern union ib_gid zgid; + +enum ib_gid_type { + /* If link layer is Ethernet, this is RoCE V1 */ + IB_GID_TYPE_IB = 0, + IB_GID_TYPE_ROCE = 0, + IB_GID_TYPE_ROCE_UDP_ENCAP = 1, + IB_GID_TYPE_SIZE +}; + +#define ROCE_V2_UDP_DPORT 4791 +struct ib_gid_attr { + enum ib_gid_type gid_type; + struct net_device *ndev; +}; + enum rdma_node_type { /* IB values map to 
NodeInfo:NodeType. */ RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, RDMA_NODE_RNIC, - RDMA_NODE_MIC + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, +}; + +enum { + /* set the local administered indication */ + IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, - RDMA_TRANSPORT_SCIF + RDMA_TRANSPORT_USNIC, + RDMA_TRANSPORT_USNIC_UDP +}; + +enum rdma_protocol_type { + RDMA_PROTOCOL_IB, + RDMA_PROTOCOL_IBOE, + RDMA_PROTOCOL_IWARP, + RDMA_PROTOCOL_USNIC_UDP }; -enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__; +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); + +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) +{ + if (network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ + return IB_GID_TYPE_IB; +} + +static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, + union ib_gid *gid) +{ + if (gid_type == IB_GID_TYPE_IB) + return RDMA_NETWORK_IB; + + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + return RDMA_NETWORK_IPV4; + else + return RDMA_NETWORK_IPV6; +} enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, - IB_LINK_LAYER_SCIF }; enum ib_device_cap_flags { - IB_DEVICE_RESIZE_MAX_WR = 1, - IB_DEVICE_BAD_PKEY_CNTR = (1<<1), - IB_DEVICE_BAD_QKEY_CNTR = (1<<2), - IB_DEVICE_RAW_MULTI = (1<<3), - IB_DEVICE_AUTO_PATH_MIG = (1<<4), - IB_DEVICE_CHANGE_PHY_PORT = (1<<5), - IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), - IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), - IB_DEVICE_SHUTDOWN_PORT = (1<<8), - IB_DEVICE_INIT_TYPE = (1<<9), - IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), - IB_DEVICE_SYS_IMAGE_GUID = (1<<11), - IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), - IB_DEVICE_SRQ_RESIZE = (1<<13), - IB_DEVICE_N_NOTIFY_CQ = (1<<14), - IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), - IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ - IB_DEVICE_MEM_WINDOW = (1<<17), + IB_DEVICE_RESIZE_MAX_WR = (1 << 0), + IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), + IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), + IB_DEVICE_RAW_MULTI = (1 << 3), + IB_DEVICE_AUTO_PATH_MIG = (1 << 4), + IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), + IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), + IB_DEVICE_SHUTDOWN_PORT = (1 << 8), + IB_DEVICE_INIT_TYPE = (1 << 9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), + IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), + IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), + IB_DEVICE_SRQ_RESIZE = (1 << 13), + IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * instead of use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey. + */ + IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), + IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB @@ -114,27 +192,38 @@ * incoming messages. 
Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ - IB_DEVICE_UD_IP_CSUM = (1<<18), - IB_DEVICE_UD_TSO = (1<<19), - IB_DEVICE_XRC = (1<<20), - IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), - IB_DEVICE_MR_ALLOCATE = (1<<23), - IB_DEVICE_SHARED_MR = (1<<24), - IB_DEVICE_QPG = (1<<25), - IB_DEVICE_UD_RSS = (1<<26), - IB_DEVICE_UD_TSS = (1<<27), - IB_DEVICE_CROSS_CHANNEL = (1<<28), - IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), + IB_DEVICE_UD_IP_CSUM = (1 << 18), + IB_DEVICE_UD_TSO = (1 << 19), + IB_DEVICE_XRC = (1 << 20), + /* - * Devices can set either IB_DEVICE_MEM_WINDOW_TYPE_2A or - * IB_DEVICE_MEM_WINDOW_TYPE_2B if it supports type 2A or type 2B - * memory windows. It can set neither to indicate it doesn't support - * type 2 windows at all. + * This device supports the IB "base memory management extension", + * which includes support for fast registrations (IB_WR_REG_MR, + * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should + * also be set by any iWarp device which must support FRs to comply + * to the iWarp verbs spec. iWarp devices also support the + * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the + * stag. */ - IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<30), - IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<31), - IB_DEVICE_SIGNATURE_HANDOVER = (1LL<<32) + IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), + IB_DEVICE_RC_IP_CSUM = (1 << 25), + IB_DEVICE_RAW_IP_CSUM = (1 << 26), + /* + * Devices should set IB_DEVICE_CROSS_CHANNEL if they + * support execution of WQEs that involve synchronization + * of I/O operations with single completion queue managed + * by hardware. + */ + IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), + IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), + IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), + IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), }; enum ib_signature_prot_cap { @@ -154,10 +243,46 @@ IB_ATOMIC_GLOB }; -enum ib_cq_create_flags { - IB_CQ_CREATE_CROSS_CHANNEL = 1 << 0, - IB_CQ_TIMESTAMP = 1 << 1, - IB_CQ_TIMESTAMP_TO_SYS_TIME = 1 << 2 +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, +}; + +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + +struct ib_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_UD + */ + u32 supported_qpts; + u32 max_rwq_indirection_tables; + u32 max_rwq_indirection_table_size; +}; + +enum ib_cq_creation_flags { + IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, +}; + +struct ib_cq_init_attr { + unsigned int cqe; + int comp_vector; + u32 flags; }; struct ib_device_attr { @@ -199,19 +324,15 @@ int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; - int max_rss_tbl_sz; u16 max_pkeys; u8 local_ca_ack_delay; - int comp_mask; - uint64_t timestamp_mask; - uint64_t hca_core_clock; - unsigned int sig_prot_cap; - unsigned int sig_guard_cap; -}; - -enum ib_device_attr_comp_mask { - IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, - IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2 + int sig_prot_cap; + int sig_guard_cap; + struct ib_odp_caps odp_caps; + uint64_t timestamp_mask; + uint64_t hca_core_clock; /* in KHZ */ + struct ib_rss_caps rss_caps; + u32 max_wq_type_rq; }; enum ib_mtu { @@ -241,7 +362,7 @@ IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5, - IB_PORT_DUMMY = -1 /* force enum signed */ + IB_PORT_DUMMY = -1, /* force enum signed */ }; enum ib_port_cap_flags { @@ -267,7 +388,8 @@ IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, - IB_PORT_CLIENT_REG_SUP = 1 << 25 + IB_PORT_CLIENT_REG_SUP = 1 << 25, + IB_PORT_IP_BASED_GIDS = 1 << 26, }; enum ib_port_width { @@ -297,58 +419,101 @@ IB_SPEED_EDR = 32 }; -struct ib_protocol_stats { - /* TBD... */ -}; - -struct iw_protocol_stats { - u64 ipInReceives; - u64 ipInHdrErrors; - u64 ipInTooBigErrors; - u64 ipInNoRoutes; - u64 ipInAddrErrors; - u64 ipInUnknownProtos; - u64 ipInTruncatedPkts; - u64 ipInDiscards; - u64 ipInDelivers; - u64 ipOutForwDatagrams; - u64 ipOutRequests; - u64 ipOutDiscards; - u64 ipOutNoRoutes; - u64 ipReasmTimeout; - u64 ipReasmReqds; - u64 ipReasmOKs; - u64 ipReasmFails; - u64 ipFragOKs; - u64 ipFragFails; - u64 ipFragCreates; - u64 ipInMcastPkts; - u64 ipOutMcastPkts; - u64 ipInBcastPkts; - u64 ipOutBcastPkts; - - u64 tcpRtoAlgorithm; - u64 tcpRtoMin; - u64 tcpRtoMax; - u64 tcpMaxConn; - u64 tcpActiveOpens; - u64 tcpPassiveOpens; - u64 tcpAttemptFails; - u64 tcpEstabResets; - u64 tcpCurrEstab; - u64 tcpInSegs; - u64 tcpOutSegs; - u64 tcpRetransSegs; - u64 tcpInErrs; - u64 tcpOutRsts; -}; - -union rdma_protocol_stats { - struct ib_protocol_stats ib; - struct iw_protocol_stats iw; -}; +/** + * struct rdma_hw_stats + * @timestamp - Used by the core code to track when the last update was + * @lifespan - Used by the core code to determine how old the counters + * should be before being updated again. Stored in jiffies, defaults + * to 10 milliseconds, drivers can override the default be specifying + * their own value during their allocation routine. + * @name - Array of pointers to static names used for the counters in + * directory. + * @num_counters - How many hardware counters there are. If name is + * shorter than this number, a kernel oops will result. Driver authors + * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) + * in their code to prevent this. 
+ * @value - Array of u64 counters that are accessed by the sysfs code and + * filled in by the drivers get_stats routine + */ +struct rdma_hw_stats { + unsigned long timestamp; + unsigned long lifespan; + const char * const *names; + int num_counters; + u64 value[]; +}; + +#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for drivers. + * @names - Array of static const char * + * @num_counters - How many elements in array + * @lifespan - How many milliseconds between updates + */ +static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const char * const *names, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), + GFP_KERNEL); + if (!stats) + return NULL; + stats->names = names; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + + return stats; +} + + +/* Define bits for the various functionality this port needs to be supported by + * the core. + */ +/* Management 0x00000FFF */ +#define RDMA_CORE_CAP_IB_MAD 0x00000001 +#define RDMA_CORE_CAP_IB_SMI 0x00000002 +#define RDMA_CORE_CAP_IB_CM 0x00000004 +#define RDMA_CORE_CAP_IW_CM 0x00000008 +#define RDMA_CORE_CAP_IB_SA 0x00000010 +#define RDMA_CORE_CAP_OPA_MAD 0x00000020 + +/* Address format 0x000FF000 */ +#define RDMA_CORE_CAP_AF_IB 0x00001000 +#define RDMA_CORE_CAP_ETH_AH 0x00002000 + +/* Protocol 0xFFF00000 */ +#define RDMA_CORE_CAP_PROT_IB 0x00100000 +#define RDMA_CORE_CAP_PROT_ROCE 0x00200000 +#define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 + +#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_SMI \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_IB_SA \ + | RDMA_CORE_CAP_AF_IB) +#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ + | RDMA_CORE_CAP_IW_CM) +#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ + | RDMA_CORE_CAP_OPA_MAD) struct ib_port_attr { + u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; @@ -368,6 +533,7 @@ u8 active_width; u8 active_speed; u8 phys_state; + bool grh_required; }; enum ib_device_modify_flags { @@ -375,9 +541,11 @@ IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; +#define IB_DEVICE_NODE_DESC_MAX 64 + struct ib_device_modify { u64 sys_image_guid; - char node_desc[64]; + char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { @@ -412,14 +580,18 @@ IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, + IB_EVENT_WQ_FATAL, }; +const char *__attribute_const__ ib_event_msg(enum ib_event_type event); + struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; + struct ib_wq *wq; u8 port_num; } element; enum ib_event_type event; @@ -455,11 +627,23 @@ union ib_gid dgid; }; +union rdma_network_hdr { + struct ib_grh ibgrh; + struct { + /* The IB spec states that if it's IPv4, the header + * is located in the last 20 bytes of the header. 
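/*
 * [Illustrative aside, not introduced by this patch] rdma_alloc_hw_stats_struct()
 * above replaces the old get_protocol_stats path: a provider allocates its
 * counter block once from its ->alloc_hw_stats callback. The foo_* names and
 * the counter list below are hypothetical.
 */
static const char * const foo_hw_stat_names[] = {
	"rx_pkts",
	"tx_pkts",
	"rx_bytes",
	"tx_bytes",
};

static struct rdma_hw_stats *foo_alloc_hw_stats(struct ib_device *ibdev,
						u8 port_num)
{
	/*
	 * num_counters must not exceed the size of the names array; the
	 * lifespan is in milliseconds and is converted to jiffies by the
	 * helper (RDMA_HW_STATS_DEFAULT_LIFESPAN is 10 ms).
	 */
	return rdma_alloc_hw_stats_struct(foo_hw_stat_names,
					  ARRAY_SIZE(foo_hw_stat_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}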
+ */ + u8 reserved[20]; + struct ip roce4grh; + }; +}; + enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) +#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 @@ -486,57 +670,48 @@ IB_RATE_300_GBPS = 18 }; -enum ib_mr_create_flags { - IB_MR_SIGNATURE_EN = 1, -}; - -/** - * ib_mr_init_attr - Memory region init attributes passed to routine - * ib_create_mr. - * @max_reg_descriptors: max number of registration descriptors that - * may be used with registration work requests. - * @flags: MR creation flags bit mask. - */ -struct ib_mr_init_attr { - int max_reg_descriptors; - u32 flags; -}; - /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ -int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ -int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__; +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); -struct ib_cq_init_attr { - int cqe; - int comp_vector; - u32 flags; -}; -enum ib_signature_type { - IB_SIG_TYPE_T10_DIF, +/** + * enum ib_mr_type - memory region type + * @IB_MR_TYPE_MEM_REG: memory region that is used for + * normal registration + * @IB_MR_TYPE_SIGNATURE: memory region that is used for + * signature operations (data-integrity + * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) + */ +enum ib_mr_type { + IB_MR_TYPE_MEM_REG, + IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, }; /** - * T10-DIF Signature types - * T10-DIF types are defined by SCSI - * specifications. + * Signature types + * IB_SIG_TYPE_NONE: Unprotected. + * IB_SIG_TYPE_T10_DIF: Type T10-DIF */ -enum ib_t10_dif_type { - IB_T10DIF_NONE, - IB_T10DIF_TYPE1, - IB_T10DIF_TYPE2, - IB_T10DIF_TYPE3 +enum ib_signature_type { + IB_SIG_TYPE_NONE, + IB_SIG_TYPE_T10_DIF, }; /** @@ -552,24 +727,26 @@ /** * struct ib_t10_dif_domain - Parameters specific for T10-DIF * domain. - * @type: T10-DIF type (0|1|2|3) * @bg_type: T10-DIF block guard type (CRC|CSUM) * @pi_interval: protection information interval. * @bg: seed of guard computation. * @app_tag: application tag of guard block * @ref_tag: initial guard block reference tag. - * @type3_inc_reftag: T10-DIF type 3 does not state - * about the reference tag, it is the user - * choice to increment it or not. + * @ref_remap: Indicate wethear the reftag increments each block + * @app_escape: Indicate to skip block check if apptag=0xffff + * @ref_escape: Indicate to skip block check if reftag=0xffffffff + * @apptag_check_mask: check bitmask of application tag. */ struct ib_t10_dif_domain { - enum ib_t10_dif_type type; enum ib_t10_dif_bg_type bg_type; - u32 pi_interval; + u16 pi_interval; u16 bg; u16 app_tag; u32 ref_tag; - bool type3_inc_reftag; + bool ref_remap; + bool app_escape; + bool ref_escape; + u16 apptag_check_mask; }; /** @@ -636,7 +813,7 @@ * enum. * @mult: multiple to convert. 
*/ -enum ib_rate mult_to_ib_rate(int mult) __attribute_const__; +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct ib_ah_attr { struct ib_global_route grh; @@ -646,8 +823,7 @@ u8 static_rate; u8 ah_flags; u8 port_num; - u8 dmac[6]; - u16 vlan_id; + u8 dmac[ETH_ALEN]; }; enum ib_wc_status { @@ -675,16 +851,17 @@ IB_WC_GENERAL_ERR }; +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); + enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, - IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, - IB_WC_FAST_REG_MR, + IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* @@ -692,7 +869,8 @@ * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, - IB_WC_RECV_RDMA_WITH_IMM + IB_WC_RECV_RDMA_WITH_IMM, + IB_WC_DUMMY = -1, /* force enum signed */ }; enum ib_wc_flags { @@ -700,15 +878,16 @@ IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), - IB_WC_WITH_SL = (1<<4), - IB_WC_WITH_SLID = (1<<5), - IB_WC_WITH_TIMESTAMP = (1<<6), - IB_WC_WITH_SMAC = (1<<7), - IB_WC_WITH_VLAN = (1<<8), + IB_WC_WITH_SMAC = (1<<4), + IB_WC_WITH_VLAN = (1<<5), + IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; @@ -725,12 +904,9 @@ u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ - int csum_ok; - struct { - uint64_t timestamp; /* timestamp = 0 indicates error*/ - } ts; - u8 smac[6]; + u8 smac[ETH_ALEN]; u16 vlan_id; + u8 network_hdr_type; }; enum ib_cq_notify_flags { @@ -776,7 +952,13 @@ u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; - u32 qpg_tss_mask_sz; + + /* + * Maximum number of rdma_rw_ctx structures in flight at a time. + * ib_create_qp() will calculate the right amount of neededed WRs + * and MRs based on this. + */ + u32 max_rdma_ctxs; }; enum ib_sig_type { @@ -801,7 +983,6 @@ IB_QPT_RAW_PACKET = 8, IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, - IB_QPT_DC_INI, IB_QPT_MAX, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the @@ -822,27 +1003,22 @@ enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, - IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, - IB_QP_CREATE_MANAGED_SEND = 1 << 3, - IB_QP_CREATE_MANAGED_RECV = 1 << 4, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, + IB_QP_CREATE_SCATTER_FCS = 1 << 8, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; -enum ib_qpg_type { - IB_QPG_NONE = 0, - IB_QPG_PARENT = (1<<0), - IB_QPG_CHILD_RX = (1<<1), - IB_QPG_CHILD_TX = (1<<2) -}; - -struct ib_qpg_init_attrib { - u32 tss_child_count; - u32 rss_child_count; -}; +/* + * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler + * callback to destroy the passed in QP. 
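/*
 * [Illustrative sketch, not part of this change] ib_wc_status_msg() above
 * gives a printable name for a completion status; a minimal polling loop over
 * an existing CQ might use it as follows (foo_poll_one is hypothetical).
 */
static void foo_poll_one(struct ib_cq *cq)
{
	struct ib_wc wc;

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		if (wc.status != IB_WC_SUCCESS) {
			printf("wc error: %s (opcode %d)\n",
			    ib_wc_status_msg(wc.status), wc.opcode);
			continue;
		}
		/* wc.wr_id (or the new wc.wr_cqe) identifies the request. */
	}
}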
+ */ struct ib_qp_init_attr { void (*event_handler)(struct ib_event *, void *); @@ -852,52 +1028,15 @@ struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; - union { - struct ib_qp *qpg_parent; /* see qpg_type */ - struct ib_qpg_init_attrib parent_attrib; - }; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; - enum ib_qpg_type qpg_type; - u8 port_num; /* special QP types only */ -}; -enum { - IB_DCT_CREATE_FLAG_RCV_INLINE = 1 << 0, - IB_DCT_CREATE_FLAGS_MASK = IB_DCT_CREATE_FLAG_RCV_INLINE, -}; - -struct ib_dct_init_attr { - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_srq *srq; - u64 dc_key; - u8 port; - u32 access_flags; - u8 min_rnr_timer; - u8 tclass; - u32 flow_label; - enum ib_mtu mtu; - u8 pkey_index; - u8 gid_index; - u8 hop_limit; - u32 create_flags; -}; - -struct ib_dct_attr { - u64 dc_key; - u8 port; - u32 access_flags; - u8 min_rnr_timer; - u8 tclass; - u32 flow_label; - enum ib_mtu mtu; - u8 pkey_index; - u8 gid_index; - u8 hop_limit; - u32 key_violations; - u8 state; + /* + * Only needed for special QP types, or when using the RW API. + */ + u8 port_num; + struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_qp_open_attr { @@ -964,12 +1103,10 @@ IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), - IB_QP_GROUP_RSS = (1<<21), - IB_QP_DC_KEY = (1<<22), - IB_QP_SMAC = (1<<23), - IB_QP_ALT_SMAC = (1<<24), - IB_QP_VID = (1<<25), - IB_QP_ALT_VID = (1<<26) + IB_QP_RESERVED1 = (1<<21), + IB_QP_RESERVED2 = (1<<22), + IB_QP_RESERVED3 = (1<<23), + IB_QP_RESERVED4 = (1<<24), }; enum ib_qp_state { @@ -980,7 +1117,7 @@ IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR, - IB_QPS_DUMMY = -1 /* force enum signed */ + IB_QPS_DUMMY = -1, /* force enum signed */ }; enum ib_mig_state { @@ -1020,40 +1157,6 @@ u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; - u8 smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u16 vlan_id; - u16 alt_vlan_id; - -}; - -struct ib_qp_attr_ex { - enum ib_qp_state qp_state; - enum ib_qp_state cur_qp_state; - enum ib_mtu path_mtu; - enum ib_mig_state path_mig_state; - u32 qkey; - u32 rq_psn; - u32 sq_psn; - u32 dest_qp_num; - int qp_access_flags; - struct ib_qp_cap cap; - struct ib_ah_attr ah_attr; - struct ib_ah_attr alt_ah_attr; - u16 pkey_index; - u16 alt_pkey_index; - u8 en_sqd_async_notify; - u8 sq_draining; - u8 max_rd_atomic; - u8 max_dest_rd_atomic; - u8 min_rnr_timer; - u8 port_num; - u8 timeout; - u8 retry_cnt; - u8 rnr_retry; - u8 alt_port_num; - u8 alt_timeout; - u64 dct_key; }; enum ib_wr_opcode { @@ -1068,10 +1171,9 @@ IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, - IB_WR_FAST_REG_MR, + IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, - IB_WR_BIND_MW, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. @@ -1086,6 +1188,7 @@ IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, + IB_WR_DUMMY = -1, /* force enum signed */ }; enum ib_send_flags { @@ -1098,7 +1201,6 @@ /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), - IB_SEND_UMR_UNREG = (1<<5) }; struct ib_sge { @@ -1107,32 +1209,16 @@ u32 lkey; }; -struct ib_fast_reg_page_list { - struct ib_device *device; - u64 *page_list; - unsigned int max_page_list_len; -}; - -/** - * struct ib_mw_bind_info - Parameters for a memory window bind operation. - * @mr: A memory region to bind the memory window to. 
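/*
 * [Illustrative sketch, not part of this change] With the qpg/DCT fields gone,
 * creating an ordinary RC QP against the slimmed-down ib_qp_init_attr looks
 * roughly like this; foo_create_qp and its parameters are hypothetical.
 */
static struct ib_qp *foo_create_qp(struct ib_pd *pd, struct ib_cq *cq,
    int txdepth)
{
	struct ib_qp_init_attr init_attr;

	memset(&init_attr, 0, sizeof(init_attr));
	init_attr.cap.max_send_wr = txdepth;
	init_attr.cap.max_recv_wr = 2;
	init_attr.cap.max_send_sge = 1;
	init_attr.cap.max_recv_sge = 1;
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.qp_type = IB_QPT_RC;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	/* port_num and rwq_ind_tbl stay zero unless a special QP type needs them. */

	return ib_create_qp(pd, &init_attr);
}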
- * @addr: The address where the memory window should begin. - * @length: The length of the memory window, in bytes. - * @mw_access_flags: Access flags from enum ib_access_flags for the window. - * - * This struct contains the shared parameters for type 1 and type 2 - * memory window bind operations. - */ -struct ib_mw_bind_info { - struct ib_mr *mr; - u64 addr; - u64 length; - int mw_access_flags; +struct ib_cqe { + void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; @@ -1141,66 +1227,82 @@ __be32 imm_data; u32 invalidate_rkey; } ex; - union { - struct { - u64 remote_addr; - u32 rkey; - } rdma; - struct { - u64 remote_addr; - u64 compare_add; - u64 swap; - u64 compare_add_mask; - u64 swap_mask; - u32 rkey; - } atomic; - struct { - struct ib_ah *ah; - void *header; - int hlen; - int mss; - u32 remote_qpn; - u32 remote_qkey; - u16 pkey_index; /* valid for GSI only */ - u8 port_num; /* valid for DR SMPs on switch only */ - } ud; - struct { - u64 iova_start; - struct ib_fast_reg_page_list *page_list; - unsigned int page_shift; - unsigned int page_list_len; - u32 length; - int access_flags; - u32 rkey; - } fast_reg; - struct { - int npages; - int access_flags; - u32 mkey; - struct ib_pd *pd; - u64 virt_addr; - u64 length; - int page_shift; - } umr; - struct { - struct ib_mw *mw; - /* The new rkey for the memory window. */ - u32 rkey; - struct ib_mw_bind_info bind_info; - } bind_mw; - struct { - struct ib_sig_attrs *sig_attrs; - struct ib_mr *sig_mr; - int access_flags; - struct ib_sge *prot; - } sig_handover; - } wr; - u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; +struct ib_rdma_wr { + struct ib_send_wr wr; + u64 remote_addr; + u32 rkey; +}; + +static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_rdma_wr, wr); +} + +struct ib_atomic_wr { + struct ib_send_wr wr; + u64 remote_addr; + u64 compare_add; + u64 swap; + u64 compare_add_mask; + u64 swap_mask; + u32 rkey; +}; + +static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_atomic_wr, wr); +} + +struct ib_ud_wr { + struct ib_send_wr wr; + struct ib_ah *ah; + void *header; + int hlen; + int mss; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* valid for DR SMPs on switch only */ +}; + +static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_ud_wr, wr); +} + +struct ib_reg_wr { + struct ib_send_wr wr; + struct ib_mr *mr; + u32 key; + int access; +}; + +static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_reg_wr, wr); +} + +struct ib_sig_handover_wr { + struct ib_send_wr wr; + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; +}; + +static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_sig_handover_wr, wr); +} + struct ib_recv_wr { struct ib_recv_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; }; @@ -1211,40 +1313,19 @@ IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), - IB_ACCESS_ALLOCATE_MR = (1<<5), - IB_ZERO_BASED = (1<<13) -}; - -struct ib_phys_buf { - u64 addr; - u64 size; -}; - -struct ib_mr_attr { - struct 
ib_pd *pd; - u64 device_virt_addr; - u64 size; - int mr_access_flags; - u32 lkey; - u32 rkey; + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), }; +/* + * XXX: these are apparently used for ->rereg_user_mr, no idea why they + * are hidden here instead of a uapi header! + */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), - IB_MR_REREG_ACCESS = (1<<2) -}; - -/** - * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. - * @wr_id: Work request id. - * @send_flags: Flags from ib_send_flags enum. - * @bind_info: More parameters of the bind operation. - */ -struct ib_mw_bind { - u64 wr_id; - int send_flags; - struct ib_mw_bind_info bind_info; + IB_MR_REREG_ACCESS = (1<<2), + IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_fmr_attr { @@ -1253,6 +1334,8 @@ u8 page_shift; }; +struct ib_umem; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; @@ -1264,10 +1347,27 @@ struct list_head ah_list; struct list_head xrcd_list; struct list_head rule_list; - struct list_head dct_list; + struct list_head wq_list; + struct list_head rwq_ind_tbl_list; int closing; - void *peer_mem_private_data; - char *peer_mem_name; + + pid_t tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + int odp_mrs_count; +#endif }; struct ib_uobject { @@ -1278,36 +1378,37 @@ int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; -struct ib_udata; -struct ib_udata_ops { - int (*copy_from)(void *dest, struct ib_udata *udata, - size_t len); - int (*copy_to)(struct ib_udata *udata, void *src, - size_t len); -}; - struct ib_udata { - struct ib_udata_ops *ops; - void __user *inbuf; + const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { + u32 local_dma_lkey; + u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ + + u32 unsafe_global_rkey; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct ib_mr *__internal_mr; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; - + struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; }; @@ -1318,25 +1419,14 @@ struct ib_uobject *uobject; }; -enum ib_cq_attr_mask { - IB_CQ_MODERATION = (1 << 0), - IB_CQ_CAP_FLAGS = (1 << 1) -}; - -enum ib_cq_cap_flags { - IB_CQ_IGNORE_OVERRUN = (1 << 0) -}; +typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); -struct ib_cq_attr { - struct { - u16 cq_count; - u16 cq_period; - } moderation; - u32 cq_cap_flags; +enum ib_poll_context { + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ }; -typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); - struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; @@ -1345,6 +1435,8 @@ void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ + enum 
ib_poll_context poll_ctx; + struct work_struct work; }; struct ib_srq { @@ -1365,14 +1457,77 @@ } ext; }; +enum ib_wq_type { + IB_WQT_RQ +}; + +enum ib_wq_state { + IB_WQS_RESET, + IB_WQS_RDY, + IB_WQS_ERR +}; + +struct ib_wq { + struct ib_device *device; + struct ib_uobject *uobject; + void *wq_context; + void (*event_handler)(struct ib_event *, void *); + struct ib_pd *pd; + struct ib_cq *cq; + u32 wq_num; + enum ib_wq_state state; + enum ib_wq_type wq_type; + atomic_t usecnt; +}; + +struct ib_wq_init_attr { + void *wq_context; + enum ib_wq_type wq_type; + u32 max_wr; + u32 max_sge; + struct ib_cq *cq; + void (*event_handler)(struct ib_event *, void *); +}; + +enum ib_wq_attr_mask { + IB_WQ_STATE = 1 << 0, + IB_WQ_CUR_STATE = 1 << 1, +}; + +struct ib_wq_attr { + enum ib_wq_state wq_state; + enum ib_wq_state curr_wq_state; +}; + +struct ib_rwq_ind_table { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; + u32 ind_tbl_num; + u32 log_ind_tbl_size; + struct ib_wq **ind_tbl; +}; + +struct ib_rwq_ind_table_init_attr { + u32 log_ind_tbl_size; + /* Each entry is a pointer to Receive Work Queue */ + struct ib_wq **ind_tbl; +}; + +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. + */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; + spinlock_t mr_lock; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; + /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; @@ -1381,27 +1536,25 @@ void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; enum ib_qp_type qp_type; - enum ib_qpg_type qpg_type; - u8 port_num; -}; - -struct ib_dct { - struct ib_device *device; - struct ib_uobject *uobject; - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_srq *srq; - u32 dct_num; + struct ib_rwq_ind_table *rwq_ind_tbl; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; - struct ib_uobject *uobject; u32 lkey; u32 rkey; - atomic_t usecnt; /* count number of MWs */ + u64 iova; + u32 length; + unsigned int page_size; + bool need_inval; + union { + struct ib_uobject *uobject; /* user */ + struct list_head qp_entry; /* FR */ + }; }; struct ib_mw { @@ -1440,14 +1593,15 @@ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, - IB_FLOW_SPEC_IB = 0x21, + IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, + IB_FLOW_SPEC_IPV6 = 0x31, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41 }; - +#define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 4 /* Flow steering rule priority is set according to it's domain. 
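/*
 * [Illustrative sketch, not part of this change] The old union inside
 * ib_send_wr is replaced by typed wrappers (ib_rdma_wr, ib_ud_wr, ib_reg_wr,
 * ...) with container_of() accessors such as rdma_wr(). Posting an RDMA WRITE
 * now embeds the base WR; foo_post_rdma_write is hypothetical.
 */
static int foo_post_rdma_write(struct ib_qp *qp, struct ib_sge *sge,
    u64 remote_addr, u32 rkey)
{
	struct ib_rdma_wr wr;
	struct ib_send_wr *bad_wr;

	memset(&wr, 0, sizeof(wr));
	wr.wr.opcode = IB_WR_RDMA_WRITE;
	wr.wr.send_flags = IB_SEND_SIGNALED;
	wr.wr.sg_list = sge;
	wr.wr.num_sge = 1;
	wr.remote_addr = remote_addr;	/* was wr.wr.rdma.remote_addr */
	wr.rkey = rkey;			/* was wr.wr.rdma.rkey */

	return ib_post_send(qp, &wr.wr, &bad_wr);
}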
@@ -1462,7 +1616,8 @@ }; enum ib_flow_flags { - IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1 + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ }; struct ib_flow_eth_filter { @@ -1470,6 +1625,8 @@ u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_eth { @@ -1480,8 +1637,10 @@ }; struct ib_flow_ib_filter { - __be32 l3_type_qpn; - u8 dst_gid[16]; + __be16 dlid; + __u8 sl; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_ib { @@ -1491,9 +1650,22 @@ struct ib_flow_ib_filter mask; }; +/* IPv4 header flags */ +enum ib_ipv4_flags { + IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ + IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the + last have this flag set */ +}; + struct ib_flow_ipv4_filter { - __be32 src_ip; - __be32 dst_ip; + __be32 src_ip; + __be32 dst_ip; + u8 proto; + u8 tos; + u8 ttl; + u8 flags; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_ipv4 { @@ -1503,9 +1675,29 @@ struct ib_flow_ipv4_filter mask; }; +struct ib_flow_ipv6_filter { + u8 src_ip[16]; + u8 dst_ip[16]; + __be32 flow_label; + u8 next_hdr; + u8 traffic_class; + u8 hop_limit; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_ipv6 { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_ipv6_filter val; + struct ib_flow_ipv6_filter mask; +}; + struct ib_flow_tcp_udp_filter { - __be16 dst_port; + __be16 dst_port; __be16 src_port; + /* Must be last */ + u8 real_sz[0]; }; struct ib_flow_spec_tcp_udp { @@ -1520,19 +1712,20 @@ enum ib_flow_spec_type type; u16 size; }; - struct ib_flow_spec_ib ib; - struct ib_flow_spec_eth eth; - struct ib_flow_spec_ipv4 ipv4; - struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_eth eth; + struct ib_flow_spec_ib ib; + struct ib_flow_spec_ipv4 ipv4; + struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_ipv6 ipv6; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; + u32 flags; u8 num_of_specs; u8 port; - u32 flags; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy @@ -1544,7 +1737,7 @@ struct ib_uobject *uobject; }; -struct ib_mad; +struct ib_mad_hdr; struct ib_grh; enum ib_process_mad_flags { @@ -1566,19 +1759,10 @@ rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; - struct ib_gid_cache **gid_cache; + struct ib_gid_table **gid_cache; u8 *lmc_cache; }; -enum verbs_values_mask { - IBV_VALUES_HW_CLOCK = 1 << 0 -}; - -struct ib_device_values { - int values_mask; - uint64_t hwclock; -}; - struct ib_dma_mapping_ops { int (*mapping_error)(struct ib_device *dev, u64 dma_addr); @@ -1601,10 +1785,14 @@ void (*unmap_sg)(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); - u64 (*dma_address)(struct ib_device *dev, - struct scatterlist *sg); - unsigned int (*dma_len)(struct ib_device *dev, - struct scatterlist *sg); + int (*map_sg_attrs)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*unmap_sg_attrs)(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*sync_single_for_cpu)(struct ib_device *dev, u64 dma_handle, size_t size, @@ -1623,8 +1811,13 @@ }; struct iw_cm_verbs; -struct ib_exp_device_attr; -struct ib_exp_qp_init_attr; + +struct ib_port_immutable 
{ + int pkey_tbl_len; + int gid_tbl_len; + u32 core_cap_flags; + u32 max_mad_size; +}; struct ib_device { struct device *dma_device; @@ -1636,36 +1829,92 @@ spinlock_t client_data_lock; struct list_head core_list; + /* Access to the client_data_list is protected by the client_data_lock + * spinlock and the lists_rwsem read-write semaphore */ struct list_head client_data_list; struct ib_cache cache; - int *pkey_tbl_len; - int *gid_tbl_len; + /** + * port_immutable is indexed by port number + */ + struct ib_port_immutable *port_immutable; int num_comp_vectors; struct iw_cm_verbs *iwcm; - int (*get_protocol_stats)(struct ib_device *device, - union rdma_protocol_stats *stats); + /** + * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the + * driver initialized data. The struct is kfree()'ed by the sysfs + * core when the device is removed. A lifespan of -1 in the return + * struct tells the core to set a default lifespan. + */ + struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, + u8 port_num); + /** + * get_hw_stats - Fill in the counter value(s) in the stats struct. + * @index - The index in the value array we wish to have updated, or + * num_counters if we want all stats updated + * Return codes - + * < 0 - Error, no counters updated + * index - Updated the single counter pointed to by index + * num_counters - Updated all counters (will reset the timestamp + * and prevent further calls for lifespan milliseconds) + * Drivers are allowed to update all counters in leiu of just the + * one given in index at their option + */ + int (*get_hw_stats)(struct ib_device *device, + struct rdma_hw_stats *stats, + u8 port, int index); int (*query_device)(struct ib_device *device, - struct ib_device_attr *device_attr); + struct ib_device_attr *device_attr, + struct ib_udata *udata); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); /* When calling get_netdev, the HW vendor's driver should return the - * net device of device @device at port @port_num. The function - * is called in rtnl_lock. The HW vendor's device driver must guarantee - * to return NULL before the net device has reached + * net device of device @device at port @port_num or NULL if such + * a net device doesn't exist. The vendor driver should call dev_hold + * on this net device. The HW vendor's device driver must guarantee + * that this function returns NULL before the net device reaches * NETDEV_UNREGISTER_FINAL state. */ - struct net_device *(*get_netdev)(struct ib_device *device, - u8 port_num); + struct net_device *(*get_netdev)(struct ib_device *device, + u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); + /* When calling add_gid, the HW vendor's driver should + * add the gid of device @device at gid index @index of + * port @port_num to be @gid. Meta-info of that gid (for example, + * the network device related to this gid is available + * at @attr. @context allows the HW vendor driver to store extra + * information together with a GID entry. The HW vendor may allocate + * memory to contain this information and store it in @context when a + * new GID entry is written to. Params are consistent until the next + * call of add_gid or delete_gid. The function should return 0 on + * success or error otherwise. The function could be called + * concurrently for different ports. This function is only called + * when roce_gid_table is used. 
+ */ + int (*add_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context); + /* When calling del_gid, the HW vendor's driver should delete the + * gid of device @device at gid index @index of port @port_num. + * Upon the deletion of a GID entry, the HW vendor must free any + * allocated memory. The caller will clear @context afterwards. + * This function is only called when roce_gid_table is used. + */ + int (*del_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, @@ -1722,12 +1971,11 @@ struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); struct ib_cq * (*create_cq)(struct ib_device *device, - struct ib_cq_init_attr *attr, + const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); - int (*modify_cq)(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask); + int (*modify_cq)(struct ib_cq *cq, u16 cq_count, + u16 cq_period); int (*destroy_cq)(struct ib_cq *cq); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); @@ -1740,40 +1988,29 @@ int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); - struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, - struct ib_udata *udata, - int mr_id); - int (*query_mr)(struct ib_mr *mr, - struct ib_mr_attr *mr_attr); - int (*dereg_mr)(struct ib_mr *mr); - int (*destroy_mr)(struct ib_mr *mr); - struct ib_mr * (*create_mr)(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, - int max_page_list_len); - struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, - int page_list_len); - void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); - int (*rereg_phys_mr)(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, + struct ib_udata *udata); + int (*rereg_user_mr)(struct ib_mr *mr, + int flags, + u64 start, u64 length, + u64 virt_addr, int mr_access_flags, - u64 *iova_start); + struct ib_pd *pd, + struct ib_udata *udata); + int (*dereg_mr)(struct ib_mr *mr); + struct ib_mr * (*alloc_mr)(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); + int (*map_mr_sg)(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int *sg_offset); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, - enum ib_mw_type type); - int (*bind_mw)(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); + enum ib_mw_type type, + struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, @@ -1792,10 +2029,13 @@ int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, - struct ib_wc *in_wc, - struct ib_grh *in_grh, - struct ib_mad *in_mad, - struct ib_mad *out_mad); + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct 
ib_udata *udata); @@ -1807,17 +2047,29 @@ int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); - - unsigned long (*get_unmapped_area)(struct file *file, - unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); - int (*ioctl)(struct ib_ucontext *context, - unsigned int cmd, - unsigned long arg); - int (*query_values)(struct ib_device *device, - int q_values, - struct ib_device_values *values); + void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); + void (*drain_rq)(struct ib_qp *qp); + void (*drain_sq)(struct ib_qp *qp); + int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, + int state); + int (*get_vf_config)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *ivf); + int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); + int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + struct ib_wq * (*create_wq)(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_wq)(struct ib_wq *wq); + int (*modify_wq)(struct ib_wq *wq, + struct ib_wq_attr *attr, + u32 wq_attr_mask, + struct ib_udata *udata); + struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dma_mapping_ops *dma_ops; struct module *owner; @@ -1835,44 +2087,61 @@ u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; - char node_desc[64]; + char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; + u16 is_switch:1; u8 node_type; u8 phys_port_cnt; - int cmd_perf; - u64 cmd_avg; - u32 cmd_n; - spinlock_t cmd_perf_lock; - - /* - * Experimental data and functions + struct ib_device_attr attrs; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + + /** + * The following mandatory functions are used only at device + * registration. Keep functions such as these at the end of this + * structure to avoid cache line misses when accessing struct ib_device + * in fast paths. */ - int (*exp_query_device)(struct ib_device *device, - struct ib_exp_device_attr *device_attr); - struct ib_qp * (*exp_create_qp)(struct ib_pd *pd, - struct ib_exp_qp_init_attr *qp_init_attr, - struct ib_udata *udata); - struct ib_dct * (*exp_create_dct)(struct ib_pd *pd, - struct ib_dct_init_attr *attr, - struct ib_udata *udata); - int (*exp_destroy_dct)(struct ib_dct *dct); - int (*exp_query_dct)(struct ib_dct *dct, struct ib_dct_attr *attr); - - u64 uverbs_exp_cmd_mask; + int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); + void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); }; struct ib_client { char *name; void (*add) (struct ib_device *); - void (*remove)(struct ib_device *); - + void (*remove)(struct ib_device *, void *client_data); + + /* Returns the net_dev belonging to this ib_client and matching the + * given parameters. + * @dev: An RDMA device that the net_dev use for communication. + * @port: A physical port number on the RDMA device. + * @pkey: P_Key that the net_dev uses if applicable. + * @gid: A GID that the net_dev uses to communicate. + * @addr: An IP address the net_dev is configured with. + * @client_data: The device's client data set by ib_set_client_data(). 
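/*
 * [Illustrative sketch, not part of this change] Per-port capabilities are now
 * published once through ->get_port_immutable() instead of the removed
 * pkey_tbl_len/gid_tbl_len arrays. A hypothetical iWARP provider might fill
 * the structure in like this.
 */
static int foo_port_immutable(struct ib_device *ibdev, u8 port_num,
    struct ib_port_immutable *immutable)
{
	struct ib_port_attr attr;
	int err;

	err = ib_query_port(ibdev, port_num, &attr);
	if (err)
		return err;

	immutable->pkey_tbl_len = attr.pkey_tbl_len;
	immutable->gid_tbl_len = attr.gid_tbl_len;
	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
	immutable->max_mad_size = 0;	/* MADs are not used on iWARP ports. */

	return 0;
}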
+ * + * An ib_client that implements a net_dev on top of RDMA devices + * (such as IP over IB) should implement this callback, allowing the + * rdma_cm module to find the right net_dev for a given request. + * + * The caller is responsible for calling dev_put on the returned + * netdev. */ + struct net_device *(*get_net_dev_by_params)( + struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr, + void *client_data); struct list_head list; }; struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); +void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); + int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); @@ -1887,12 +2156,32 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { - return udata->ops->copy_from(dest, udata, len); + return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return udata->ops->copy_to(udata, src, len); + return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; +} + +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = (const char __user *)udata->inbuf + offset; + bool ret; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = memdup_user(p, len); + if (IS_ERR(buf)) + return false; + + ret = !memchr_inv(buf, 0, len); + kfree(buf); + return ret; } /** @@ -1919,17 +2208,330 @@ int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr); - int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); +/** + * rdma_cap_ib_switch - Check if the device is IB switch + * @device: Device to check + * + * Device driver is responsible for setting is_switch bit on + * in ib_device structure at init time. + * + * Return: true if the device is IB switch. + */ +static inline bool rdma_cap_ib_switch(const struct ib_device *device) +{ + return device->is_switch; +} + +/** + * rdma_start_port - Return the first valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return start port number + */ +static inline u8 rdma_start_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 0 : 1; +} + +/** + * rdma_end_port - Return the last valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return last port number + */ +static inline u8 rdma_end_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 
0 : device->phys_port_cnt; +} + +static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; +} + +static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; +} + +static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; +} + +static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +{ + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); +} + +/** + * rdma_cap_ib_mad - Check if the port of a device supports Infiniband + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Management Datagrams (MAD) are a required part of the InfiniBand + * specification and are supported on all InfiniBand devices. A slightly + * extended version are also supported on OPA interfaces. + * + * Return: true if the port supports sending/receiving of MAD packets. + */ +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; +} + +/** + * rdma_cap_opa_mad - Check if the port of device provides support for OPA + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Intel OmniPath devices extend and/or replace the InfiniBand Management + * datagrams with their own versions. These OPA MADs share many but not all of + * the characteristics of InfiniBand MADs. + * + * OPA MADs differ in the following ways: + * + * 1) MADs are variable size up to 2K + * IBTA defined MADs remain fixed at 256 bytes + * 2) OPA SMPs must carry valid PKeys + * 3) OPA SMP packets are a different format + * + * Return: true if the port supports OPA MAD packet formats. + */ +static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) + == RDMA_CORE_CAP_OPA_MAD; +} + +/** + * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband + * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). + * @device: Device to check + * @port_num: Port number to check + * + * Each InfiniBand node is required to provide a Subnet Management Agent + * that the subnet manager can access. Prior to the fabric being fully + * configured by the subnet manager, the SMA is accessed via a well known + * interface called the Subnet Management Interface (SMI). This interface + * uses directed route packets to communicate with the SM to get around the + * chicken and egg problem of the SM needing to know what's on the fabric + * in order to configure the fabric, and needing to configure the fabric in + * order to send packets to the devices on the fabric. 
These directed + * route packets do not need the fabric fully configured in order to reach + * their destination. The SMI is the only method allowed to send + * directed route packets on an InfiniBand fabric. + * + * Return: true if the port provides an SMI. + */ +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; +} + +/** + * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * The InfiniBand Communication Manager is one of many pre-defined General + * Service Agents (GSA) that are accessed via the General Service + * Interface (GSI). Its role is to facilitate establishment of connections + * between nodes as well as other management related tasks for established + * connections. + * + * Return: true if the port supports an IB CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; +} + +/** + * rdma_cap_iw_cm - Check if the port of device has the capability IWARP + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * Similar to above, but specific to iWARP connections which have a different + * management protocol than InfiniBand. + * + * Return: true if the port supports an iWARP CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; +} + +/** + * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband + * Subnet Administration. + * @device: Device to check + * @port_num: Port number to check + * + * An InfiniBand Subnet Administration (SA) service is a pre-defined General + * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand + * fabrics, devices should resolve routes to other hosts by contacting the + * SA to query the proper route. + * + * Return: true if the port should act as a client to the fabric Subnet + * Administration interface. This does not imply that the SA service is + * running locally. + */ +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; +} + +/** + * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband + * Multicast. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand multicast registration is more complex than normal IPv4 or + * IPv6 multicast registration. Each Host Channel Adapter must register + * with the Subnet Manager when it wishes to join a multicast group. It + * should do so only once regardless of how many queue pairs it subscribes + * to this group. And it should leave the group only after all queue pairs + * attached to the group have been detached. + * + * Return: true if the port must undertake the additional administrative + * overhead of registering/unregistering with the SM and tracking of the + * total number of queue pairs attached to the multicast group.
+ */ +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +{ + return rdma_cap_ib_sa(device, port_num); +} + +/** + * rdma_cap_af_ib - Check if the port of device has the capability + * Native Infiniband Address. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default + * GID. RoCE uses a different mechanism, but still generates a GID via + * a prescribed mechanism and port specific data. + * + * Return: true if the port uses a GID address to identify devices on the + * network. + */ +static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; +} + +/** + * rdma_cap_eth_ah - Check if the port of device has the capability + * Ethernet Address Handle. + * @device: Device to check + * @port_num: Port number to check + * + * RoCE is InfiniBand over Ethernet, and it uses a well defined technique + * to fabricate GIDs over Ethernet/IP specific addresses native to the + * port. Normally, packet headers are generated by the sending host + * adapter, but when sending connectionless datagrams, we must manually + * inject the proper headers for the fabric we are communicating over. + * + * Return: true if we are running as a RoCE port and must force the + * addition of a Global Route Header built from our Ethernet Address + * Handle into our header list for connectionless packets. + */ +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; +} + +/** + * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. + * + * @device: Device + * @port_num: Port number + * + * This MAD size includes the MAD headers and MAD payload. No other headers + * are included. + * + * Return the max MAD size required by the Port. Will return 0 if the port + * does not support MADs + */ +static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].max_mad_size; +} + +/** + * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table + * @device: Device to check + * @port_num: Port number to check + * + * RoCE GID table mechanism manages the various GIDs for a device. + * + * NOTE: if allocating the port's GID table has failed, this call will still + * return true, but any RoCE GID table API will fail. + * + * Return: true if the port uses RoCE GID table mechanism in order to manage + * its GIDs. + */ +static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, + u8 port_num) +{ + return rdma_protocol_roce(device, port_num) && + device->add_gid && device->del_gid; +} + +/* + * Check if the device supports READ W/ INVALIDATE. + */ +static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) +{ + /* + * iWarp drivers must support READ W/ INVALIDATE. No other protocol + * has support for it yet. 
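/*
 * [Illustrative sketch, not part of this change] ULPs are expected to branch
 * on these rdma_protocol_* and rdma_cap_* helpers instead of inspecting
 * node_type or the transport directly; foo_scan_ports is hypothetical.
 */
static void foo_scan_ports(struct ib_device *ibdev)
{
	u8 port;

	for (port = rdma_start_port(ibdev); port <= rdma_end_port(ibdev); port++) {
		if (rdma_protocol_iwarp(ibdev, port)) {
			/* No SA on iWARP; connections go through the iw_cm. */
			continue;
		}
		if (rdma_cap_ib_sa(ibdev, port)) {
			/* Resolve paths by querying the SA. */
		}
		if (rdma_cap_eth_ah(ibdev, port)) {
			/* RoCE: supply a GRH built from the Ethernet AH. */
		}
	}
}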
+ */ + return rdma_protocol_iwarp(dev, port_num); +} + int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid); + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state); +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info); +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type); int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); @@ -1943,25 +2545,30 @@ struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); -/** - * ib_alloc_pd - Allocates an unused protection domain. - * @device: The device on which to allocate the protection domain. - * - * A protection domain object provides an association between QPs, shared - * receive queues, address handles, memory regions, and memory windows. - */ -struct ib_pd *ib_alloc_pd(struct ib_device *device); +enum ib_pd_flags { + /* + * Create a memory registration for all memory in the system and place + * the rkey for it into pd->unsafe_global_rkey. This can be used by + * ULPs to avoid the overhead of dynamic MRs. + * + * This flag is generally considered unsafe and must only be used in + * extremly trusted environments. Every use of it will log a warning + * in the kernel log. + */ + IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, +}; -/** - * ib_dealloc_pd - Deallocates a protection domain. - * @pd: The protection domain to deallocate. - */ -int ib_dealloc_pd(struct ib_pd *pd); +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller); +#define ib_alloc_pd(device, flags) \ + __ib_alloc_pd((device), (flags), __func__) +void ib_dealloc_pd(struct ib_pd *pd); /** * ib_create_ah - Creates an address handle for the given address vector. @@ -1984,8 +2591,9 @@ * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr); +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the @@ -1999,8 +2607,8 @@ * The address handle is used to reference a local or global destination * in all UD QP post sends. */ -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num); +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num); /** * ib_modify_ah - Modifies the address vector associated with an address @@ -2187,6 +2795,10 @@ return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); +void ib_free_cq(struct ib_cq *cq); + /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. @@ -2196,16 +2808,15 @@ * asynchronous event not associated with a completion occurs on the CQ. 
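/*
 * [Illustrative sketch, not part of this change] ib_alloc_pd() now takes a
 * flags word and yields a usable pd->local_dma_lkey, so most consumers no
 * longer need ib_get_dma_mr(); foo_setup_pd is hypothetical.
 */
static struct ib_pd *foo_setup_pd(struct ib_device *ibdev)
{
	struct ib_pd *pd;

	pd = ib_alloc_pd(ibdev, 0);
	if (IS_ERR(pd))
		return pd;

	/*
	 * pd->local_dma_lkey serves local SGEs directly. Passing
	 * IB_PD_UNSAFE_GLOBAL_RKEY instead would also populate
	 * pd->unsafe_global_rkey, at the cost of a kernel warning.
	 */
	return pd;
}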
* @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. - * @cqe: The minimum size of the CQ. - * @comp_vector - Completion vector used to signal completion events. - * Must be >= 0 and < context->num_comp_vectors. + * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector); + void *cq_context, + const struct ib_cq_init_attr *cq_attr); /** * ib_resize_cq - Modifies the capacity of the CQ. @@ -2217,16 +2828,13 @@ int ib_resize_cq(struct ib_cq *cq, int cqe); /** - * ib_modify_cq - Modifies the attributes for the specified CQ and then - * transitions the CQ to the given state. + * ib_modify_cq - Modifies moderation params of the CQ * @cq: The CQ to modify. - * @cq_attr: specifies the CQ attributes to modify. - * @cq_attr_mask: A bit-mask used to specify which attributes of the CQ - * are being modified. + * @cq_count: number of CQEs that will trigger an event + * @cq_period: max period of time in usec before triggering an event + * */ -int ib_modify_cq(struct ib_cq *cq, - struct ib_cq_attr *cq_attr, - int cq_attr_mask); +int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq - Destroys the specified CQ. @@ -2312,18 +2920,6 @@ } /** - * ib_get_dma_mr - Returns a memory region for system memory that is - * usable for DMA. - * @pd: The protection domain associated with the memory region. - * @mr_access_flags: Specifies the memory access rights. - * - * Note that the ib_dma_*() functions defined below must be used - * to create/destroy addresses used with the Lkey or Rkey returned - * by ib_get_dma_mr(). 
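With ib_create_cq() now taking a struct ib_cq_init_attr and ib_modify_cq() taking the moderation parameters directly, a caller might look roughly like the sketch below; the handler name, queue depth, and moderation values are illustrative only, not recommendations.

```c
#include <linux/err.h>
#include <rdma/ib_verbs.h>

static void my_comp_handler(struct ib_cq *cq, void *cq_context)
{
	/* completion processing would go here */
}

static struct ib_cq *my_create_cq(struct ib_device *device, void *ctx)
{
	struct ib_cq_init_attr attr = {
		.cqe = 256,		/* requested minimum CQ depth */
		.comp_vector = 0,
	};
	struct ib_cq *cq;

	cq = ib_create_cq(device, my_comp_handler, NULL, ctx, &attr);
	if (IS_ERR(cq))
		return cq;

	/* Trigger an event after 16 completions or 10 usec, whichever first. */
	(void)ib_modify_cq(cq, 16, 10);
	return cq;
}
```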
- */ -struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags); - -/** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check @@ -2371,19 +2967,19 @@ static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { return dma_map_single_attrs(dev->dma_device, cpu_addr, size, - direction, attrs); + direction, dma_attrs); } static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { return dma_unmap_single_attrs(dev->dma_device, addr, size, - direction, attrs); + direction, dma_attrs); } /** @@ -2458,28 +3054,39 @@ static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { - return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs); + if (dev->dma_ops) + return dev->dma_ops->map_sg_attrs(dev, sg, nents, direction, + dma_attrs); + else + return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *dma_attrs) { - dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs); + if (dev->dma_ops) + return dev->dma_ops->unmap_sg_attrs(dev, sg, nents, direction, + dma_attrs); + else + dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); } /** * ib_sg_dma_address - Return the DMA address from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_address() into sg_dma_address(). */ static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg) { - if (dev->dma_ops) - return dev->dma_ops->dma_address(dev, sg); return sg_dma_address(sg); } @@ -2487,12 +3094,13 @@ * ib_sg_dma_len - Return the DMA length from a scatter/gather entry * @dev: The device for which the DMA addresses were created * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_len() into sg_dma_len(). */ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg) { - if (dev->dma_ops) - return dev->dma_ops->dma_len(dev, sg); return sg_dma_len(sg); } @@ -2574,59 +3182,6 @@ } /** - * ib_reg_phys_mr - Prepares a virtually addressed memory region for use - * by an HCA. - * @pd: The protection domain associated assigned to the registered region. - * @phys_buf_array: Specifies a list of physical buffers to use in the - * memory region. - * @num_phys_buf: Specifies the size of the phys_buf_array. - * @mr_access_flags: Specifies the memory access rights. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. - * Conceptually, this call performs the functions deregister memory region - * followed by register physical memory region. 
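For the ib_dma_*() wrappers touched above, a hedged sketch of the usual map-and-check pattern; the buffer, length, and helper name are placeholders supplied by an imaginary caller.

```c
#include <linux/dma-mapping.h>
#include <rdma/ib_verbs.h>

/* Illustrative only; "buf" and "len" come from the caller. */
static u64 my_map_buffer(struct ib_device *dev, void *buf, size_t len)
{
	u64 dma_addr;

	dma_addr = ib_dma_map_single(dev, buf, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(dev, dma_addr))
		return 0;	/* this sketch treats 0 as "not mapped" */

	return dma_addr;
}
```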
Where possible, - * resources are reused instead of deallocated and reallocated. - * @mr: The memory region to modify. - * @mr_rereg_mask: A bit-mask used to indicate which of the following - * properties of the memory region are being modified. - * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies - * the new protection domain to associated with the memory region, - * otherwise, this parameter is ignored. - * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies a list of physical buffers to use in the new - * translation, otherwise, this parameter is ignored. - * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies the size of the phys_buf_array, otherwise, this - * parameter is ignored. - * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this - * field specifies the new memory access rights, otherwise, this - * parameter is ignored. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_query_mr - Retrieves information about a specific memory region. - * @mr: The memory region to retrieve information about. - * @mr_attr: The attributes of the specified memory region. - */ -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - -/** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. @@ -2635,60 +3190,9 @@ */ int ib_dereg_mr(struct ib_mr *mr); - -/** - * ib_create_mr - Allocates a memory region that may be used for - * signature handover operations. - * @pd: The protection domain associated with the region. - * @mr_init_attr: memory region init attributes. - */ -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - -/** - * ib_destroy_mr - Destroys a memory region that was created using - * ib_create_mr and removes it from HW translation tables. - * @mr: The memory region to destroy. - * - * This function can fail, if the memory region has memory windows bound to it. - */ -int ib_destroy_mr(struct ib_mr *mr); - -/** - * ib_alloc_fast_reg_mr - Allocates memory region usable with the - * IB_WR_FAST_REG_MR send work request. - * @pd: The protection domain associated with the region. - * @max_page_list_len: requested max physical buffer list length to be - * used with fast register work requests for this MR. - */ -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); - -/** - * ib_alloc_fast_reg_page_list - Allocates a page list array - * @device - ib device pointer. - * @page_list_len - size of the page list array to be allocated. - * - * This allocates and returns a struct ib_fast_reg_page_list * and a - * page_list array that is at least page_list_len in size. The actual - * size is returned in max_page_list_len. The caller is responsible - * for initializing the contents of the page_list array before posting - * a send work request with the IB_WC_FAST_REG_MR opcode. - * - * The page_list array entries must be translated using one of the - * ib_dma_*() functions just like the addresses passed to - * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct - * ib_fast_reg_page_list must not be modified by the caller until the - * IB_WC_FAST_REG_MR work request completes. 
- */ -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( - struct ib_device *device, int page_list_len); - -/** - * ib_free_fast_reg_page_list - Deallocates a previously allocated - * page list array. - * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. - */ -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR @@ -2714,42 +3218,6 @@ } /** - * ib_alloc_mw - Allocates a memory window. - * @pd: The protection domain associated with the memory window. - * @type: The type of the memory window (1 or 2). - */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); - -/** - * ib_bind_mw - Posts a work request to the send queue of the specified - * QP, which binds the memory window to the given address range and - * remote access attributes. - * @qp: QP to post the bind work request on. - * @mw: The memory window to bind. - * @mw_bind: Specifies information about the memory window, including - * its address range, remote access rights, and associated memory region. - * - * If there is no immediate error, the function will update the rkey member - * of the mw parameter to its new value. The bind operation can still fail - * asynchronously. - */ -static inline int ib_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* XXX reference counting in corresponding MR? */ - return mw->device->bind_mw ? - mw->device->bind_mw(qp, mw, mw_bind) : - -ENOSYS; -} - -/** - * ib_dealloc_mw - Deallocates a memory window. - * @mw: The memory window to deallocate. - */ -int ib_dealloc_mw(struct ib_mw *mw); - -/** * ib_alloc_fmr - Allocates a unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. 
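With ib_get_dma_mr(), ib_reg_phys_mr(), the fast-reg page-list API, and memory windows removed, registration funnels through ib_alloc_mr() and ib_map_mr_sg() followed by an IB_WR_REG_MR work request. A sketch of that flow under assumed variable names, with error unwinding abbreviated:

```c
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Hypothetical registration helper; cleanup paths are abbreviated. */
static int my_reg_mr(struct ib_qp *qp, struct ib_pd *pd,
		     struct scatterlist *sg, int sg_nents)
{
	struct ib_send_wr *bad_wr;
	struct ib_reg_wr reg_wr = {};
	struct ib_mr *mr;
	int n;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
	if (n != sg_nents)
		return n < 0 ? n : -EINVAL;

	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.mr = mr;
	reg_wr.key = mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
	    IB_ACCESS_REMOTE_WRITE;

	return ib_post_send(qp, &reg_wr.wr, &bad_wr);
}
```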
@@ -2826,47 +3294,6 @@ struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); -struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr, - struct ib_udata *udata); -int ib_destroy_dct(struct ib_dct *dct); -int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr); - -int ib_query_values(struct ib_device *device, - int q_values, struct ib_device_values *values); - -static inline void ib_active_speed_enum_to_rate(u8 active_speed, - int *rate, - char **speed) -{ - switch (active_speed) { - case IB_SPEED_DDR: - *speed = " DDR"; - *rate = 50; - break; - case IB_SPEED_QDR: - *speed = " QDR"; - *rate = 100; - break; - case IB_SPEED_FDR10: - *speed = " FDR10"; - *rate = 100; - break; - case IB_SPEED_FDR: - *speed = " FDR"; - *rate = 140; - break; - case IB_SPEED_EDR: - *speed = " EDR"; - *rate = 250; - break; - case IB_SPEED_SDR: - default: /* default to SDR for invalid rates */ - *rate = 25; - break; - } - -} - static inline int ib_check_mr_access(int flags) { /* @@ -2895,4 +3322,38 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, + u16 pkey, const union ib_gid *gid, + const struct sockaddr *addr); +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr); +int ib_destroy_wq(struct ib_wq *wq); +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, + u32 wq_attr_mask); +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr* + wq_ind_table_init_attr); +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size); + +static inline int +ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + int n; + + n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); + mr->iova = 0; + + return n; +} + +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); + +void ib_drain_rq(struct ib_qp *qp); +void ib_drain_sq(struct ib_qp *qp); +void ib_drain_qp(struct ib_qp *qp); #endif /* IB_VERBS_H */ Index: sys/ofed/include/rdma/ib_verbs_exp.h =================================================================== --- sys/ofed/include/rdma/ib_verbs_exp.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2004 Infinicon Corporation. All rights reserved. - * Copyright (c) 2004 Intel Corporation. All rights reserved. - * Copyright (c) 2004 Topspin Corporation. All rights reserved. - * Copyright (c) 2004 Voltaire Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
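The newly declared drain helpers (ib_drain_rq/ib_drain_sq/ib_drain_qp) give ULPs a defined way to quiesce a QP before teardown. A minimal sketch, assuming the caller owns the QP and its CQs:

```c
#include <rdma/ib_verbs.h>

/* Hypothetical teardown path for a connected QP. */
static void my_teardown_qp(struct ib_qp *qp)
{
	/* Wait for every posted send and receive to complete or flush. */
	ib_drain_qp(qp);
	ib_destroy_qp(qp);
}
```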
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IB_VERBS_EXP_H -#define IB_VERBS_EXP_H - -#include - - -enum ib_exp_device_cap_flags2 { - IB_EXP_DEVICE_DC_TRANSPORT = 1 << 0, - IB_EXP_DEVICE_QPG = 1 << 1, - IB_EXP_DEVICE_UD_RSS = 1 << 2, - IB_EXP_DEVICE_UD_TSS = 1 << 3 -}; - -enum ib_exp_device_attr_comp_mask { - IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1, - IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2, - IB_EXP_DEVICE_ATTR_CAP_FLAGS2 = 1ULL << 3, - IB_EXP_DEVICE_ATTR_DC_REQ_RD = 1ULL << 4, - IB_EXP_DEVICE_ATTR_DC_RES_RD = 1ULL << 5, - IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ = 1ULL << 6, - IB_EXP_DEVICE_ATTR_RSS_TBL_SZ = 1ULL << 7, -}; - -struct ib_exp_device_attr { - struct ib_device_attr base; - /* Use IB_EXP_DEVICE_ATTR_... for exp_comp_mask */ - uint32_t exp_comp_mask; - uint64_t device_cap_flags2; - uint32_t dc_rd_req; - uint32_t dc_rd_res; - uint32_t inline_recv_sz; - uint32_t max_rss_tbl_sz; -}; - -struct ib_exp_qp_init_attr { - void (*event_handler)(struct ib_event *, void *); - void *qp_context; - struct ib_cq *send_cq; - struct ib_cq *recv_cq; - struct ib_srq *srq; - struct ib_xrcd *xrcd; /* XRC TGT QPs only */ - struct ib_qp_cap cap; - union { - struct ib_qp *qpg_parent; /* see qpg_type */ - struct ib_qpg_init_attrib parent_attrib; - }; - enum ib_sig_type sq_sig_type; - enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; - enum ib_qpg_type qpg_type; - u8 port_num; /* special QP types only */ - u32 max_inl_recv; -}; - - -int ib_exp_query_device(struct ib_device *device, - struct ib_exp_device_attr *device_attr); - - - - -#endif /* IB_VERBS_EXP_H */ Index: sys/ofed/include/rdma/iw_cm.h =================================================================== --- sys/ofed/include/rdma/iw_cm.h +++ sys/ofed/include/rdma/iw_cm.h @@ -1,7 +1,6 @@ /* * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -50,12 +49,11 @@ struct iw_cm_event { enum iw_cm_event_type event; int status; - struct sockaddr_in local_addr; - struct sockaddr_in remote_addr; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; void *private_data; void *provider_data; u8 private_data_len; - struct socket *so; u8 ord; u8 ird; }; @@ -85,15 +83,17 @@ iw_cm_handler cm_handler; /* client callback function */ void *context; /* client cb context */ struct ib_device *device; - struct sockaddr_in local_addr; - struct sockaddr_in remote_addr; + struct sockaddr_storage local_addr; /* local addr */ + struct sockaddr_storage remote_addr; + struct sockaddr_storage m_local_addr; /* nmapped local addr */ + struct sockaddr_storage m_remote_addr; /* nmapped rem addr */ void *provider_data; /* provider private data */ iw_event_handler event_handler; /* cb for provider events */ /* Used by provider to add and remove refs on IW cm_id */ void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); - struct socket *so; + u8 tos; }; struct iw_cm_conn_param { @@ -121,13 +121,11 @@ int (*reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); - int (*create_listen_ep)(struct iw_cm_id *cm_id, + int (*create_listen)(struct iw_cm_id *cm_id, int backlog); - void (*destroy_listen_ep)(struct iw_cm_id *cm_id); - - void (*newconn)(struct iw_cm_id *parent_cm_id, - struct socket *so); + int (*destroy_listen)(struct iw_cm_id *cm_id); + char ifname[IFNAMSIZ]; }; /** @@ -138,7 +136,7 @@ * returned IW CM identifier. * @context: User specified context associated with the id. */ -struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so, +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, iw_cm_handler cm_handler, void *context); /** Index: sys/ofed/include/rdma/iw_portmap.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/iw_portmap.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
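Since iw_cm_event and iw_cm_id now carry sockaddr_storage instead of sockaddr_in, iWARP consumers have to branch on the address family rather than assume IPv4. A hypothetical event handler of the kind passed to iw_create_cm_id():

```c
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <rdma/iw_cm.h>

/* Hypothetical handler; only the iw_cm_* types come from the header. */
static int my_iw_event_handler(struct iw_cm_id *cm_id,
			       struct iw_cm_event *event)
{
	struct sockaddr_storage *ra = &event->remote_addr;

	if (ra->ss_family == AF_INET) {
		struct sockaddr_in *sin = (struct sockaddr_in *)ra;

		/* IPv4 peer: sin->sin_addr / sin->sin_port */
		(void)sin;
	} else if (ra->ss_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ra;

		/* IPv6 peer: sin6->sin6_addr / sin6->sin6_port */
		(void)sin6;
	}
	return 0;
}
```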
+ */ +#ifndef _IW_PORTMAP_H +#define _IW_PORTMAP_H + +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT +}; + +struct iwpm_dev_data { + char dev_name[IWPM_DEVNAME_SIZE]; + char if_name[IWPM_IFNAME_SIZE]; +}; + +struct iwpm_sa_data { + struct sockaddr_storage loc_addr; + struct sockaddr_storage mapped_loc_addr; + struct sockaddr_storage rem_addr; + struct sockaddr_storage mapped_rem_addr; +}; + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void); + +#endif /* _IW_PORTMAP_H */ Index: sys/ofed/include/rdma/opa_port_info.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/opa_port_info.h @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
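A small usage sketch for the new port-mapper structures above, assuming the caller already has the pre- and post-mapping addresses; everything except the iwpm_* names is invented for illustration.

```c
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <rdma/iw_portmap.h>

/* Hypothetical helper; only the iwpm_* names come from the header. */
static int my_save_mapping(struct iwpm_sa_data *pm,
			   const struct sockaddr_storage *loc,
			   const struct sockaddr_storage *mapped_loc)
{
	if (!iwpm_valid_pid())
		return -EINVAL;		/* userspace port mapper not running */

	pm->loc_addr = *loc;
	pm->mapped_loc_addr = *mapped_loc;
	return 0;
}
```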
+ */ + +#if !defined(OPA_PORT_INFO_H) +#define OPA_PORT_INFO_H + +#define OPA_PORT_LINK_MODE_NOP 0 /* No change */ +#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ + +#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */ +#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */ +#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */ +#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */ +#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */ + +#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */ +#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */ +#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */ + +/* Link Down / Neighbor Link Down Reason; indicated as follows: */ +#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */ +#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1 +#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2 +#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3 +#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4 +#define OPA_LINKDOWN_REASON_BAD_SLID 5 +#define OPA_LINKDOWN_REASON_BAD_DLID 6 +#define OPA_LINKDOWN_REASON_BAD_L2 7 +#define OPA_LINKDOWN_REASON_BAD_SC 8 +#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9 +#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10 +#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11 +#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12 +#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13 +#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14 +#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15 +#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16 +#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17 +#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18 +#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19 +#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20 +#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21 +#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22 +#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23 +#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24 +#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25 +#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26 +#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27 +#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28 +#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29 +#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30 +#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31 +#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32 +#define OPA_LINKDOWN_REASON_UNKNOWN 33 +/* 34 -reserved */ +#define OPA_LINKDOWN_REASON_REBOOT 35 +#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36 +/* 37-38 reserved */ +#define OPA_LINKDOWN_REASON_FM_BOUNCE 39 +#define OPA_LINKDOWN_REASON_SPEED_POLICY 40 +#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41 +/* 42-48 reserved */ +#define OPA_LINKDOWN_REASON_DISCONNECTED 49 +#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50 +#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51 +#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52 +/* 53 reserved */ +#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54 +/* 55 reserved */ +#define OPA_LINKDOWN_REASON_POWER_POLICY 56 +#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57 +#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58 +/* 59 reserved */ +#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60 +#define OPA_LINKDOWN_REASON_SMA_DISABLED 61 +/* 62 reserved */ +#define OPA_LINKDOWN_REASON_TRANSIENT 63 +/* 64-255 reserved */ + +/* OPA Link Init reason; indicated as follows: */ +/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */ +#define OPA_LINKINIT_REASON_NOP 0 +#define OPA_LINKINIT_REASON_LINKUP (1 << 4) +#define OPA_LINKINIT_REASON_FLAPPING (2 
<< 4) +#define OPA_LINKINIT_REASON_CLEAR (8 << 4) +#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4) +#define OPA_LINKINIT_QUARANTINED (9 << 4) +#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4) + +#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */ +#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ +#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */ + +#define OPA_LINK_WIDTH_1X 0x0001 +#define OPA_LINK_WIDTH_2X 0x0002 +#define OPA_LINK_WIDTH_3X 0x0004 +#define OPA_LINK_WIDTH_4X 0x0008 + +#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7) +#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6) +#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5) +#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4) +#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3) +/* reserved (1 << 2) */ +#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1) +#define OPA_CAP_MASK3_IsVLrSupported (1 << 0) + +/** + * new MTU values + */ +enum { + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7, +}; + +enum { + OPA_PORT_PHYS_CONF_DISCONNECTED = 0, + OPA_PORT_PHYS_CONF_STANDARD = 1, + OPA_PORT_PHYS_CONF_FIXED = 2, + OPA_PORT_PHYS_CONF_VARIABLE = 3, + OPA_PORT_PHYS_CONF_SI_PHOTO = 4 +}; + +enum port_info_field_masks { + /* vl.cap */ + OPA_PI_MASK_VL_CAP = 0x1F, + /* port_states.ledenable_offlinereason */ + OPA_PI_MASK_OFFLINE_REASON = 0x0F, + OPA_PI_MASK_LED_ENABLE = 0x40, + /* port_states.unsleepstate_downdefstate */ + OPA_PI_MASK_UNSLEEP_STATE = 0xF0, + OPA_PI_MASK_DOWNDEF_STATE = 0x0F, + /* port_states.portphysstate_portstate */ + OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0, + OPA_PI_MASK_PORT_STATE = 0x0F, + /* port_phys_conf */ + OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F, + /* collectivemask_multicastmask */ + OPA_PI_MASK_COLLECT_MASK = 0x38, + OPA_PI_MASK_MULTICAST_MASK = 0x07, + /* mkeyprotect_lmc */ + OPA_PI_MASK_MKEY_PROT_BIT = 0xC0, + OPA_PI_MASK_LMC = 0x0F, + /* smsl */ + OPA_PI_MASK_SMSL = 0x1F, + /* partenforce_filterraw */ + /* Filter Raw In/Out bits 1 and 2 were removed */ + OPA_PI_MASK_LINKINIT_REASON = 0xF0, + OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08, + OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04, + /* operational_vls */ + OPA_PI_MASK_OPERATIONAL_VL = 0x1F, + /* sa_qp */ + OPA_PI_MASK_SA_QP = 0x00FFFFFF, + /* sm_trap_qp */ + OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF, + /* localphy_overrun_errors */ + OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0, + OPA_PI_MASK_OVERRUN_ERRORS = 0x0F, + /* clientrereg_subnettimeout */ + OPA_PI_MASK_CLIENT_REREGISTER = 0x80, + OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F, + /* port_link_mode */ + OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10), + OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5), + OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0), + /* port_link_crc_mode */ + OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00, + OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0, + OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F, + /* port_mode */ + OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001, + OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002, + OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004, + OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008, + OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010, + OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020, + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040, + /* flit_control.interleave */ + OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12), + OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0), + + /* port_error_action */ + OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000, + /* 7 bits reserved */ + 
OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000, + OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000, + OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000, + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000, + OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000, + OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000, + OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000, + OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000, + /* 2 bits reserved */ + OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000, + OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000, + OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080, + OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040, + OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020, + OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004, + OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002, + OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001, + + /* pass_through.res_drctl */ + OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01, + + /* buffer_units */ + OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11), + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6), + OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3), + OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0), + + /* neigh_mtu.pvlx_to_mtu */ + OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0, + OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F, + + /* neigh_mtu.vlstall_hoq_life */ + OPA_PI_MASK_VL_STALL = (0x03 << 5), + OPA_PI_MASK_HOQ_LIFE = (0x1F << 0), + + /* port_neigh_mode */ + OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3), + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2), + OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0), + + /* resptime_value */ + OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F, + + /* mtucap */ + OPA_PI_MASK_MTU_CAP = 0x0F, +}; + +struct opa_port_states { + u8 reserved; + u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ + u8 reserved2; + u8 portphysstate_portstate; /* 4 bits, 4 bits */ +}; + +struct opa_port_state_info { + struct opa_port_states port_states; + __be16 link_width_downgrade_tx_active; + __be16 link_width_downgrade_rx_active; +}; + +struct opa_port_info { + __be32 lid; + __be32 flow_control_mask; + + struct { + u8 res; /* was inittype */ + u8 cap; /* 3 res, 5 bits */ + __be16 high_limit; + __be16 preempt_limit; + u8 arb_high_cap; + u8 arb_low_cap; + } vl; + + struct opa_port_states port_states; + u8 port_phys_conf; /* 4 res, 4 bits */ + u8 collectivemask_multicastmask; /* 2 res, 3, 3 */ + u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */ + u8 smsl; /* 3 res, 5 bits */ + + u8 partenforce_filterraw; /* bit fields */ + u8 operational_vls; /* 3 res, 5 bits */ + __be16 pkey_8b; + __be16 pkey_10b; + __be16 mkey_violations; + + __be16 pkey_violations; + __be16 qkey_violations; + __be32 sm_trap_qp; /* 8 bits, 24 bits */ + + __be32 sa_qp; /* 8 bits, 24 bits */ + u8 neigh_port_num; + u8 link_down_reason; + u8 neigh_link_down_reason; + u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */ + + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } link_speed; + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } link_width; + struct { + __be16 supported; + __be16 enabled; + __be16 tx_active; + __be16 rx_active; + } link_width_downgrade; + __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */ + __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */ + + __be16 port_mode; /* 9 res, bit fields */ + struct { + __be16 
supported; + __be16 enabled; + } port_packet_format; + struct { + __be16 interleave; /* 2 res, 2,2,5,5 */ + struct { + __be16 min_initial; + __be16 min_tail; + u8 large_pkt_limit; + u8 small_pkt_limit; + u8 max_small_pkt_limit; + u8 preemption_limit; + } preemption; + } flit_control; + + __be32 reserved4; + __be32 port_error_action; /* bit field */ + + struct { + u8 egress_port; + u8 res_drctl; /* 7 res, 1 */ + } pass_through; + __be16 mkey_lease_period; + __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */ + + __be32 reserved5; + __be32 sm_lid; + + __be64 mkey; + + __be64 subnet_prefix; + + struct { + u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */ + } neigh_mtu; + + struct { + u8 vlstall_hoqlife; /* 3 bits, 5 bits */ + } xmit_q[OPA_MAX_VLS]; + + struct { + u8 addr[16]; + } ipaddr_ipv6; + + struct { + u8 addr[4]; + } ipaddr_ipv4; + + u32 reserved6; + u32 reserved7; + u32 reserved8; + + __be64 neigh_node_guid; + + __be32 ib_cap_mask; + __be16 reserved9; /* was ib_cap_mask2 */ + __be16 opa_cap_mask; + + __be32 reserved10; /* was link_roundtrip_latency */ + __be16 overall_buffer_space; + __be16 reserved11; /* was max_credit_hint */ + + __be16 diag_code; + struct { + u8 buffer; + u8 wire; + } replay_depth; + u8 port_neigh_mode; + u8 mtucap; /* 4 res, 4 bits */ + + u8 resptimevalue; /* 3 res, 5 bits */ + u8 local_port_num; + u8 reserved12; + u8 reserved13; /* was guid_cap */ +} __attribute__ ((packed)); + +#endif /* OPA_PORT_INFO_H */ Index: sys/ofed/include/rdma/opa_smi.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/opa_smi.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
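The OPA port-info fields above are packed nibbles and bit fields; a decoding sketch using the published masks. The accessor names are hypothetical and the >> 4 shift is an assumption read off OPA_PI_MASK_PORT_PHYSICAL_STATE being the high nibble.

```c
#include <linux/types.h>
#include <rdma/opa_port_info.h>

/* Hypothetical accessors; the >>4 shift pairs with the 0xF0 mask above. */
static u8 my_opa_port_state(const struct opa_port_info *pi)
{
	return pi->port_states.portphysstate_portstate &
	    OPA_PI_MASK_PORT_STATE;
}

static u8 my_opa_port_phys_state(const struct opa_port_info *pi)
{
	return (pi->port_states.portphysstate_portstate &
	    OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
}
```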
+ */ + +#if !defined(OPA_SMI_H) +#define OPA_SMI_H + +#include +#include + +#define OPA_SMP_LID_DATA_SIZE 2016 +#define OPA_SMP_DR_DATA_SIZE 1872 +#define OPA_SMP_MAX_PATH_HOPS 64 + +#define OPA_MAX_VLS 32 +#define OPA_MAX_SLS 32 +#define OPA_MAX_SCS 32 + +#define OPA_SMI_CLASS_VERSION 0x80 + +#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF) + +struct opa_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + union { + struct { + uint8_t data[OPA_SMP_LID_DATA_SIZE]; + } lid; + struct { + __be32 dr_slid; + __be32 dr_dlid; + u8 initial_path[OPA_SMP_MAX_PATH_HOPS]; + u8 return_path[OPA_SMP_MAX_PATH_HOPS]; + u8 reserved[8]; + u8 data[OPA_SMP_DR_DATA_SIZE]; + } dr; + } route; +} __packed; + + +/* Subnet management attributes */ +/* ... */ +#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010) +#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011) +#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015) +#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016) +#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017) +#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018) +#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020) +#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032) +#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080) +#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082) +#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083) +#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084) +#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085) +/* ... */ +#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087) +/* ... */ +#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A) +/* ... */ + +struct opa_node_description { + u8 data[64]; +} __attribute__ ((packed)); + +struct opa_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be32 reserved; + __be64 system_image_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; /* network byte order */ +} __attribute__ ((packed)); + +#define OPA_PARTITION_TABLE_BLK_SIZE 32 + +static inline u8 +opa_get_smp_direction(const struct opa_smp *smp) +{ + return ib_get_smp_direction((const struct ib_smp *)smp); +} + +static inline u8 *opa_get_smp_data(struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return smp->route.dr.data; + + return smp->route.lid.data; +} + +static inline size_t opa_get_smp_data_size(const struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(smp->route.dr.data); + + return sizeof(smp->route.lid.data); +} + +static inline size_t opa_get_smp_header_size(const struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(*smp) - sizeof(smp->route.dr.data); + + return sizeof(*smp) - sizeof(smp->route.lid.data); +} + +#endif /* OPA_SMI_H */ Index: sys/ofed/include/rdma/peer_mem.h =================================================================== --- sys/ofed/include/rdma/peer_mem.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2013, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
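The opa_get_smp_data()/opa_get_smp_data_size() helpers above let callers ignore whether an SMP is LID-routed or directed-route. A sketch that copies an attribute payload either way; the function name and error choice are illustrative only.

```c
#include <linux/errno.h>
#include <linux/string.h>
#include <rdma/opa_smi.h>

/* Hypothetical helper; works for LID-routed and directed-route SMPs. */
static int my_fill_smp_data(struct opa_smp *smp, const void *attr, size_t len)
{
	if (len > opa_get_smp_data_size(smp))
		return -ENOSPC;

	memcpy(opa_get_smp_data(smp), attr, len);
	return 0;
}
```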
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if !defined(PEER_MEM_H) -#define PEER_MEM_H - -#include -#include -#include -#include -#include - - -#define IB_PEER_MEMORY_NAME_MAX 64 -#define IB_PEER_MEMORY_VER_MAX 16 - -struct peer_memory_client { - char name[IB_PEER_MEMORY_NAME_MAX]; - char version[IB_PEER_MEMORY_VER_MAX]; - /* acquire return code: 1 mine, 0 - not mine */ - int (*acquire) (unsigned long addr, size_t size, void *peer_mem_private_data, - char *peer_mem_name, void **client_context); - int (*get_pages) (unsigned long addr, - size_t size, int write, int force, - struct sg_table *sg_head, - void *client_context, void *core_context); - int (*dma_map) (struct sg_table *sg_head, void *client_context, - struct device *dma_device, int dmasync, int *nmap); - int (*dma_unmap) (struct sg_table *sg_head, void *client_context, - struct device *dma_device); - void (*put_pages) (struct sg_table *sg_head, void *client_context); - unsigned long (*get_page_size) (void *client_context); - void (*release) (void *client_context); - -}; - -typedef int (*invalidate_peer_memory)(void *reg_handle, - void *core_context); - -void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, - invalidate_peer_memory *invalidate_callback); -void ib_unregister_peer_memory_client(void *reg_handle); - -#endif Index: sys/ofed/include/rdma/rdma_cm.h =================================================================== --- sys/ofed/include/rdma/rdma_cm.h +++ sys/ofed/include/rdma/rdma_cm.h @@ -1,7 +1,6 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -60,13 +59,11 @@ RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT, - RDMA_CM_EVENT_ALT_ROUTE_RESOLVED, - RDMA_CM_EVENT_ALT_ROUTE_ERROR, - RDMA_CM_EVENT_LOAD_ALT_PATH, - RDMA_CM_EVENT_ALT_PATH_LOADED, + RDMA_CM_EVENT_TIMEWAIT_EXIT }; +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); + enum rdma_port_space { RDMA_PS_SDP = 0x0001, RDMA_PS_IPOIB = 0x0002, @@ -75,12 +72,10 @@ RDMA_PS_UDP = 0x0111, }; -enum alt_path_type { - RDMA_ALT_PATH_NONE, - RDMA_ALT_PATH_PORT, - RDMA_ALT_PATH_LID, - RDMA_ALT_PATH_BEST -}; +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL struct rdma_addr { struct sockaddr_storage src_addr; @@ -105,6 +100,7 @@ /* Fields below ignored if a QP is created on the rdma_cm_id. */ u8 srq; u32 qp_num; + u32 qkey; }; struct rdma_ud_param { @@ -113,7 +109,6 @@ struct ib_ah_attr ah_attr; u32 qp_num; u32 qkey; - u8 alt_path_index; }; struct rdma_cm_event { @@ -160,19 +155,22 @@ enum rdma_port_space ps; enum ib_qp_type qp_type; u8 port_num; - void *ucontext; }; /** * rdma_create_id - Create an RDMA identifier. * + * @net: The network namespace in which to create the new id. * @event_handler: User callback invoked to report events associated with the * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. + * + * The id holds a reference on the network namespace until it is destroyed. */ -struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, +struct rdma_cm_id *rdma_create_id(struct vnet *net, + rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, enum ib_qp_type qp_type); @@ -223,19 +221,6 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); /** - * rdma_enable_apm - Get ready to use APM for the given ID. - * Actual Alternate path discovery and load will take place only - * after a connection has been established. - * - * Calling this function only has an effect on the connection's client side. - * It should be called after rdma_resolve_route and before rdma_connect. - * - * @id: RDMA identifier. - * @alt_type: Alternate path type to resolve. - */ -int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type); - -/** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. * @@ -348,11 +333,13 @@ * address. * @id: Communication identifier associated with the request. * @addr: Multicast address identifying the group to join. + * @join_state: Multicast JoinState bitmap requested by port. + * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits. * @context: User-defined context associated with the join request, returned * to the user through the private_data pointer in multicast events. */ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, - void *context); + u8 join_state, void *context); /** * rdma_leave_multicast - Leave the multicast group specified by the given @@ -394,14 +381,11 @@ */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); -/** - * rdma_set_timeout - Set the QP timeout associated with a connection - * identifier. - * @id: Communication identifier to associated with service type. 
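rdma_create_id() now takes the network namespace (a vnet on FreeBSD) as its first argument, and rdma_event_msg() gives printable event names. A hedged client-side sketch; TD_TO_VNET(curthread) is assumed here to be the way a kernel consumer names its own vnet, and the handler is hypothetical.

```c
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <net/vnet.h>
#include <rdma/rdma_cm.h>

/* Hypothetical CM event handler and id creation. */
static int my_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	printf("cma event %s, status %d\n",
	    rdma_event_msg(event->event), event->status);
	return 0;
}

static struct rdma_cm_id *my_create_id(void *context)
{
	/* TD_TO_VNET(curthread) is assumed to select the caller's vnet. */
	return rdma_create_id(TD_TO_VNET(curthread), my_cma_handler, context,
	    RDMA_PS_TCP, IB_QPT_RC);
}
```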
- * @timeout: QP timeout + /** + * rdma_get_service_id - Return the IB service ID for a specified address. + * @id: Communication identifier associated with the address. + * @addr: Address for the service ID. */ -void rdma_set_timeout(struct rdma_cm_id *id, int timeout); -int rdma_cma_any_addr(struct sockaddr *addr); -int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, - unsigned short dev_type, void **cm_id); +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); + #endif /* RDMA_CM_H */ Index: sys/ofed/include/rdma/rdma_vt.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/rdma_vt.h @@ -0,0 +1,500 @@ +#ifndef DEF_RDMA_VT_H +#define DEF_RDMA_VT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Structure that low level drivers will populate in order to register with the + * rdmavt layer. 
+ */ + +#include +#include +#include +#include +#include +#include + +#define RVT_MAX_PKEY_VALUES 16 + +struct rvt_ibport { + struct rvt_qp __rcu *qp[2]; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + u64 tid; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 mkey_lease_period; + u16 sm_lid; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + + /* + * Driver is expected to keep these up to date. These + * counters are informational only and not required to be + * completely accurate. + */ + u64 n_rc_resends; + u64 n_seq_naks; + u64 n_rdma_seq; + u64 n_rnr_naks; + u64 n_other_naks; + u64 n_loop_pkts; + u64 n_pkt_drops; + u64 n_vl15_dropped; + u64 n_rc_timeouts; + u64 n_dmawait; + u64 n_unaligned; + u64 n_rc_dupreq; + u64 n_rc_seqnak; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + + /* Hot-path per CPU counters to avoid cacheline trading to update */ + u64 z_rc_acks; + u64 z_rc_qacks; + u64 z_rc_delayed_comp; + u64 __percpu *rc_acks; + u64 __percpu *rc_qacks; + u64 __percpu *rc_delayed_comp; + + void *priv; /* driver private data */ + + /* + * The pkey table is allocated and maintained by the driver. Drivers + * need to have access to this before registering with rdmav. However + * rdmavt will need access to it so drivers need to proviee this during + * the attach port API call. + */ + u16 *pkey_table; + + struct rvt_ah *sm_ah; +}; + +#define RVT_CQN_MAX 16 /* maximum length of cq name */ + +/* + * Things that are driver specific, module parameters in hfi1 and qib + */ +struct rvt_driver_params { + struct ib_device_attr props; + + /* + * Anything driver specific that is not covered by props + * For instance special module parameters. Goes here. + */ + unsigned int lkey_table_size; + unsigned int qp_table_size; + int qpn_start; + int qpn_inc; + int qpn_res_start; + int qpn_res_end; + int nports; + int npkeys; + char cq_name[RVT_CQN_MAX]; + int node; + int psn_mask; + int psn_shift; + int psn_modify_mask; + u32 core_cap_flags; + u32 max_mad_size; + u8 qos_shift; + u8 max_rdma_atomic; + u8 reserved_operations; +}; + +/* Protection domain */ +struct rvt_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address handle */ +struct rvt_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; + u8 vl; + u8 log_pmtu; +}; + +struct rvt_dev_info; +struct rvt_swqe; +struct rvt_driver_provided { + /* + * Which functions are required depends on which verbs rdmavt is + * providing and which verbs the driver is overriding. See + * check_support() for details. + */ + + /* Passed to ib core registration. Callback to create syfs files */ + int (*port_callback)(struct ib_device *, u8, struct kobject *); + + /* + * Returns a string to represent the device for which is being + * registered. This is primarily used for error and debug messages on + * the console. + */ + const char * (*get_card_name)(struct rvt_dev_info *rdi); + + /* + * Returns a pointer to the undelying hardware's PCI device. 
This is + * used to display information as to what hardware is being referenced + * in an output message + */ + struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi); + + /* + * Allocate a private queue pair data structure for driver specific + * information which is opaque to rdmavt. Errors are returned via + * ERR_PTR(err). The driver is free to return NULL or a valid + * pointer. + */ + void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp); + + /* + * Free the driver's private qp structure. + */ + void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp); + + /* + * Inform the driver the particular qp in quesiton has been reset so + * that it can clean up anything it needs to. + */ + void (*notify_qp_reset)(struct rvt_qp *qp); + + /* + * Give the driver a notice that there is send work to do. It is up to + * the driver to generally push the packets out, this just queues the + * work with the driver. There are two variants here. The no_lock + * version requires the s_lock not to be held. The other assumes the + * s_lock is held. + */ + void (*schedule_send)(struct rvt_qp *qp); + void (*schedule_send_no_lock)(struct rvt_qp *qp); + + /* + * Sometimes rdmavt needs to kick the driver's send progress. That is + * done by this call back. + */ + void (*do_send)(struct rvt_qp *qp); + + /* + * Get a path mtu from the driver based on qp attributes. + */ + int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + + /* + * Notify driver that it needs to flush any outstanding IO requests that + * are waiting on a qp. + */ + void (*flush_qp_waiters)(struct rvt_qp *qp); + + /* + * Notify driver to stop its queue of sending packets. Nothing else + * should be posted to the queue pair after this has been called. + */ + void (*stop_send_queue)(struct rvt_qp *qp); + + /* + * Have the drivr drain any in progress operations + */ + void (*quiesce_qp)(struct rvt_qp *qp); + + /* + * Inform the driver a qp has went to error state. + */ + void (*notify_error_qp)(struct rvt_qp *qp); + + /* + * Get an MTU for a qp. + */ + u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + u32 pmtu); + /* + * Convert an mtu to a path mtu + */ + int (*mtu_to_path_mtu)(u32 mtu); + + /* + * Get the guid of a port in big endian byte order + */ + int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid); + + /* + * Query driver for the state of the port. + */ + int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props); + + /* + * Tell driver to shutdown a port + */ + int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num); + + /* Tell driver to send a trap for changed port capabilities */ + void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num); + + /* + * The following functions can be safely ignored completely. Any use of + * these is checked for NULL before blindly calling. Rdmavt should also + * be functional if drivers omit these. + */ + + /* Called to inform the driver that all qps should now be freed. 
*/ + unsigned (*free_all_qps)(struct rvt_dev_info *rdi); + + /* Driver specific AH validation */ + int (*check_ah)(struct ib_device *, struct ib_ah_attr *); + + /* Inform the driver a new AH has been created */ + void (*notify_new_ah)(struct ib_device *, struct ib_ah_attr *, + struct rvt_ah *); + + /* Let the driver pick the next queue pair number*/ + int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num, gfp_t gfp); + + /* Determine if its safe or allowed to modify the qp */ + int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific QP modification/notification-of */ + void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific work request checking */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + + /* Notify driver a mad agent has been created */ + void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + + /* Notify driver a mad agent has been removed */ + void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + +}; + +struct rvt_dev_info { + struct ib_device ibdev; /* Keep this first. Nothing above here */ + + /* + * Prior to calling for registration the driver will be responsible for + * allocating space for this structure. + * + * The driver will also be responsible for filling in certain members of + * dparms.props. The driver needs to fill in dparms exactly as it would + * want values reported to a ULP. This will be returned to the caller + * in rdmavt's device. The driver should also therefore refrain from + * modifying this directly after registration with rdmavt. + */ + + /* Driver specific properties */ + struct rvt_driver_params dparms; + + /* post send table */ + const struct rvt_operation_params *post_parms; + + struct rvt_mregion __rcu *dma_mr; + struct rvt_lkey_table lkey_table; + + /* Driver specific helper functions */ + struct rvt_driver_provided driver_f; + + /* Internal use */ + int n_pds_allocated; + spinlock_t n_pds_lock; /* Protect pd allocated count */ + + int n_ahs_allocated; + spinlock_t n_ahs_lock; /* Protect ah allocated count */ + + u32 n_srqs_allocated; + spinlock_t n_srqs_lock; /* Protect srqs allocated count */ + + int flags; + struct rvt_ibport **ports; + + /* QP */ + struct rvt_qp_ibdev *qp_dev; + u32 n_qps_allocated; /* number of QPs allocated for device */ + u32 n_rc_qps; /* number of RC QPs allocated for device */ + u32 busy_jiffies; /* timeout scaling based on RC QP count */ + spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */ + + /* memory maps */ + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + spinlock_t pending_lock; /* protect pending mmap list */ + + /* CQ */ + struct kthread_worker *worker; /* per device cq worker */ + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; /* protect count of in use cqs */ + + /* Multicast */ + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + +}; + +static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct rvt_pd, ibpd); +} + +static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah) +{ + return container_of(ibah, struct rvt_ah, ibah); +} + +static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev) +{ + return container_of(ibdev, struct 
rvt_dev_info, ibdev); +} + +static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct rvt_srq, ibsrq); +} + +static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct rvt_qp, ibqp); +} + +static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) +{ + /* + * All ports have same number of pkeys. + */ + return rdi->dparms.npkeys; +} + +/* + * Return the max atomic suitable for determining + * the size of the ack ring buffer in a QP. + */ +static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + 1; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, + int port_index, + unsigned index) +{ + if (index >= rvt_get_npkeys(rdi)) + return 0; + else + return rdi->ports[port_index]->pkey_table[index]; +} + +/** + * rvt_lookup_qpn - return the QP with the given QPN + * @ibp: the ibport + * @qpn: the QP number to look up + * + * The caller must hold the rcu_read_lock(), and keep the lock until + * the returned qp is no longer in use. + */ +/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */ +static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, + struct rvt_ibport *rvp, + u32 qpn) __must_hold(RCU) +{ + struct rvt_qp *qp = NULL; + + if (unlikely(qpn <= 1)) { + qp = rcu_dereference(rvp->qp[qpn]); + } else { + u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits); + + for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) + break; + } + return qp; +} + +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +void rvt_dealloc_device(struct rvt_dev_info *rdi); +int rvt_register_device(struct rvt_dev_info *rvd); +void rvt_unregister_device(struct rvt_dev_info *rvd); +int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct ib_sge *sge, int acc); +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid); + +#endif /* DEF_RDMA_VT_H */ Index: sys/ofed/include/rdma/rdmavt_cq.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/rdmavt_cq.h @@ -0,0 +1,99 @@ +#ifndef DEF_RDMAVT_INCCQ_H +#define DEF_RDMAVT_INCCQ_H + +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct rvt_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct rvt_dev_info *rdi; + struct rvt_cq_wc *queue; + struct rvt_mmap_info *ip; +}; + +static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct rvt_cq, ibcq); +} + +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); + +#endif /* DEF_RDMAVT_INCCQH */ Index: sys/ofed/include/rdma/rdmavt_mr.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/rdmavt_mr.h @@ -0,0 +1,140 @@ +#ifndef DEF_RDMAVT_INCMR_H +#define DEF_RDMAVT_INCMR_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once + * drivers no longer need access to the MR directly. + */ + +/* + * A segment is a linear region of low physical memory. + * Used by the verbs layer. + */ +struct rvt_seg { + void *vaddr; + size_t length; +}; + +/* The number of rvt_segs that fit in a page. */ +#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg)) + +struct rvt_segarray { + struct rvt_seg segs[RVT_SEGSZ]; +}; + +struct rvt_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of rvt_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ + struct completion comp; /* complete when refcount goes to zero */ + atomic_t refcount; + struct rvt_segarray *map[0]; /* the segments */ +}; + +#define RVT_MAX_LKEY_TABLE_BITS 23 + +struct rvt_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct rvt_mregion __rcu **table; +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. 
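As a concrete reading of the RVT_SEGSZ definition above: assuming 4 KiB pages and 8-byte pointers, sizeof(struct rvt_seg) is 16, so each rvt_segarray holds 256 segments. Segment i of a region can then be located as in this sketch (the helper name is made up):

	static inline struct rvt_seg *example_seg(struct rvt_mregion *mr, u32 i)
	{
	        /* i / RVT_SEGSZ picks the rvt_segarray page, i % RVT_SEGSZ the slot */
	        return &mr->map[i / RVT_SEGSZ]->segs[i % RVT_SEGSZ];
	}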
+ */ +struct rvt_sge { + struct rvt_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +struct rvt_sge_state { + struct rvt_sge *sg_list; /* next SGE to be used if any */ + struct rvt_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +static inline void rvt_put_mr(struct rvt_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + complete(&mr->comp); +} + +static inline void rvt_get_mr(struct rvt_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +static inline void rvt_put_ss(struct rvt_sge_state *ss) +{ + while (ss->num_sge) { + rvt_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + +#endif /* DEF_RDMAVT_INCMRH */ Index: sys/ofed/include/rdma/rdmavt_qp.h =================================================================== --- /dev/null +++ sys/ofed/include/rdma/rdmavt_qp.h @@ -0,0 +1,535 @@ +#ifndef DEF_RDMAVT_INCQP_H +#define DEF_RDMAVT_INCQP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +/* + * Atomic bit definitions for r_aflags. + */ +#define RVT_R_WRID_VALID 0 +#define RVT_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. 
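A short sketch (illustrative only, function names invented) of how the rvt_get_mr()/rvt_put_mr() pair above is typically used: readers hold a reference while they touch the region, and a teardown path drops its own reference and then waits on mr->comp, which the final rvt_put_mr() completes:

	static void example_use_mr(struct rvt_mregion *mr)
	{
	        rvt_get_mr(mr);
	        /* ... walk mr->map[] segments while the I/O is in flight ... */
	        rvt_put_mr(mr);
	}

	static void example_free_mr(struct rvt_mregion *mr)
	{
	        rvt_put_mr(mr);                 /* drop the owner's reference */
	        wait_for_completion(&mr->comp); /* wait out any remaining users */
	        /* region memory may be released now */
	}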
+ */ +#define RVT_R_REUSE_SGE 0x01 +#define RVT_R_RDMAR_SEQ 0x02 +#define RVT_R_RSP_NAK 0x04 +#define RVT_R_RSP_SEND 0x08 +#define RVT_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * RVT_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * RVT_S_BUSY - send tasklet is processing the QP + * RVT_S_TIMER - the RC retry timer is active + * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * RVT_S_WAIT_RNR - waiting for RNR timeout + * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * RVT_S_WAIT_PIO - waiting for a send buffer to be available + * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets + * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available + * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * RVT_S_WAIT_KMEM - waiting for kernel memory to be available + * RVT_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK + * RVT_S_ECN - a BECN was queued to the send engine + */ +#define RVT_S_SIGNAL_REQ_WR 0x0001 +#define RVT_S_BUSY 0x0002 +#define RVT_S_TIMER 0x0004 +#define RVT_S_RESP_PENDING 0x0008 +#define RVT_S_ACK_PENDING 0x0010 +#define RVT_S_WAIT_FENCE 0x0020 +#define RVT_S_WAIT_RDMAR 0x0040 +#define RVT_S_WAIT_RNR 0x0080 +#define RVT_S_WAIT_SSN_CREDIT 0x0100 +#define RVT_S_WAIT_DMA 0x0200 +#define RVT_S_WAIT_PIO 0x0400 +#define RVT_S_WAIT_PIO_DRAIN 0x0800 +#define RVT_S_WAIT_TX 0x1000 +#define RVT_S_WAIT_DMA_DESC 0x2000 +#define RVT_S_WAIT_KMEM 0x4000 +#define RVT_S_WAIT_PSN 0x8000 +#define RVT_S_WAIT_ACK 0x10000 +#define RVT_S_SEND_ONE 0x20000 +#define RVT_S_UNLIMITED_CREDIT 0x40000 +#define RVT_S_AHG_VALID 0x80000 +#define RVT_S_AHG_CLEAR 0x100000 +#define RVT_S_ECN 0x200000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define RVT_S_ANY_WAIT_IO \ + (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \ + RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \ + RVT_S_WAIT_PSN | RVT_S_WAIT_ACK) + +#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) + +/* Number of bits to pay attention to in the opcode for checking qp type */ +#define RVT_OPCODE_QP_MASK 0xE0 + +/* Flags for checking QP state (see ib_rvt_state_ops[]) */ +#define RVT_POST_SEND_OK 0x01 +#define RVT_POST_RECV_OK 0x02 +#define RVT_PROCESS_RECV_OK 0x04 +#define RVT_PROCESS_SEND_OK 0x08 +#define RVT_PROCESS_NEXT_SEND_OK 0x10 +#define RVT_FLUSH_SEND 0x20 +#define RVT_FLUSH_RECV 0x40 +#define RVT_PROCESS_OR_FLUSH_SEND \ + (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) + +/* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. 
+ */ +struct rvt_swqe { + union { + struct ib_send_wr wr; /* don't use wr.sg_list */ + struct ib_ud_wr ud_wr; + struct ib_reg_wr reg_wr; + struct ib_rdma_wr rdma_wr; + struct ib_atomic_wr atomic_wr; + }; + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct rvt_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct rvt_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct rvt_rwqe wq[0]; +}; + +struct rvt_rq { + struct rvt_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; +}; + +/* + * This structure is used by rvt_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct rvt_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct rvt_ack_entry { + struct rvt_sge rdma_sge; + u64 atomic_data; + u32 psn; + u32 lpsn; + u8 opcode; + u8 sent; +}; + +#define RC_QP_SCALING_INTERVAL 5 + +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) + * + * This supports table driven post send so that + * the driver can have differing an potentially + * different sets of operations. + * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. 
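A minimal sketch of the nesting order documented above (r_rq.lock taken before s_lock, released in reverse order), written as a modify-path style helper; the function name is made up and real callers may need irq-saving variants:

	static void example_modify_locked(struct rvt_qp *qp)
	{
	        spin_lock_irq(&qp->r_rq.lock);
	        spin_lock(&qp->s_lock);
	        /* ... update state shared by the receive and send sides ... */
	        spin_unlock(&qp->s_lock);
	        spin_unlock_irq(&qp->r_rq.lock);
	}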
+ */ +struct rvt_qp { + struct ib_qp ibqp; + void *priv; /* Driver private data */ + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct rvt_qp __rcu *next; /* link list for QPN hash table */ + struct rvt_swqe *s_wq; /* send work queue */ + struct rvt_mmap_info *ip; + + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + int srate_mbps; /* s_srate (below) converted to Mbit/s */ + pid_t pid; /* pid for user mode QPs */ + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_ahgpsn; /* set to the psn in the copy of the header */ + + u16 pmtu; /* decoded from path_mtu */ + u8 log_pmtu; /* shift for pmtu */ + u8 state; /* QP state */ + u8 allowed_ops; /* high order bits of allowed opcodes */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + struct rvt_ack_entry *s_ack_queue; + struct rvt_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + u32 r_psn; /* expected rcv packet sequence number */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waiting to respond */ + + struct rvt_sge_state r_sge; /* current receive data */ + struct rvt_rq r_rq; /* receive work queue */ + + /* post send line */ + spinlock_t s_hlock ____cacheline_aligned_in_smp; + u32 s_head; /* new entries added here */ + u32 s_next_psn; /* PSN for next request */ + u32 s_avail; /* number of entries avail */ + u32 s_ssn; /* SSN of tail entry */ + atomic_t s_reserved_used; /* reserved entries in use */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + u32 s_flags; + struct rvt_sge_state *s_cur_sge; + struct rvt_swqe *s_wqe; + struct rvt_sge_state s_sge; /* current send request data */ + struct rvt_mregion *s_rdma_mr; + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_lsn; /* limit 
sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + s8 s_ahgidx; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct rvt_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + + /* + * This sge list MUST be last. Do not add anything below here. + */ + struct rvt_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +struct rvt_srq { + struct ib_srq ibsrq; + struct rvt_rq rq; + struct rvt_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +#define RVT_QPN_MAX BIT(24) +#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) +#define RVT_QPN_MASK 0xFFFFFF + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct rvt_qpn_map { + void *page; +}; + +struct rvt_qpn_table { + spinlock_t lock; /* protect changes to the qp table */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u8 incr; + /* bit map of free QP numbers other than 0/1 */ + struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES]; +}; + +struct rvt_qp_ibdev { + u32 qp_table_size; + u32 qp_table_bits; + struct rvt_qp __rcu **qp_table; + spinlock_t qpt_lock; /* qptable lock */ + struct rvt_qpn_table qpn_table; +}; + +/* + * There is one struct rvt_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct rvt_mcast_qp. + */ +struct rvt_mcast_qp { + struct list_head list; + struct rvt_qp *qp; +}; + +struct rvt_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* + * Since struct rvt_swqe is not a fixed size, we can't simply index into + * struct rvt_qp.s_wq. This function does the array index computation. + */ +static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, + unsigned n) +{ + return (struct rvt_swqe *)((char *)qp->s_wq + + (sizeof(struct rvt_swqe) + + qp->s_max_sge * + sizeof(struct rvt_sge)) * n); +} + +/* + * Since struct rvt_rwqe is not a fixed size, we can't simply index into + * struct rvt_rwq.wq. This function does the array index computation. 
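Spelled out, the computation in rvt_get_swqe_ptr() above is just a variable stride: each send entry occupies sizeof(struct rvt_swqe) plus s_max_sge trailing struct rvt_sge elements, and entry n starts n strides past qp->s_wq. A restatement with the stride named explicitly (equivalent to the helper above, shown only for clarity):

	static inline struct rvt_swqe *example_swqe(struct rvt_qp *qp, unsigned n)
	{
	        size_t stride = sizeof(struct rvt_swqe) +
	                        qp->s_max_sge * sizeof(struct rvt_sge);

	        return (struct rvt_swqe *)((char *)qp->s_wq + n * stride);
	}

The receive-side helper that follows applies the same idea using rq->max_sge and struct ib_sge.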
+ */ +static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) +{ + return (struct rvt_rwqe *) + ((char *)rq->wq->wq + + (sizeof(struct rvt_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/** + * rvt_get_qp - get a QP reference + * @qp - the QP to hold + */ +static inline void rvt_get_qp(struct rvt_qp *qp) +{ + atomic_inc(&qp->refcount); +} + +/** + * rvt_put_qp - release a QP reference + * @qp - the QP to release + */ +static inline void rvt_put_qp(struct rvt_qp *qp) +{ + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine is used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to ensure that post send sees a stable + * s_avail. + * + * An smp_mb__after_atomic() is used to ensure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. + */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + atomic_dec(&qp->s_reserved_used); + /* ensure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + +extern const int ib_rvt_state_ops[]; + +struct rvt_dev_info; +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); + +#endif /* DEF_RDMAVT_INCQP_H */ Index: sys/ofed/include/rdma/sdp_socket.h =================================================================== --- sys/ofed/include/rdma/sdp_socket.h +++ /dev/null @@ -1,23 +0,0 @@ -/* Stuff that should go into include/linux/socket.h */ - -#ifndef SDP_SOCKET_H -#define SDP_SOCKET_H - -#ifndef __FreeBSD__ -#ifndef AF_INET_SDP -#define AF_INET_SDP 27 -#define PF_INET_SDP AF_INET_SDP -#endif -#endif - -#ifndef SDP_ZCOPY_THRESH -#define SDP_ZCOPY_THRESH 80 -#endif - -#ifndef SDP_LAST_BIND_ERR -#define SDP_LAST_BIND_ERR 81 -#endif - -/* TODO: AF_INET6_SDP ? */ - -#endif Index: sys/ofed/include/uapi/rdma/ib_user_mad.h =================================================================== --- sys/ofed/include/uapi/rdma/ib_user_mad.h +++ sys/ofed/include/uapi/rdma/ib_user_mad.h @@ -191,12 +191,55 @@ __u8 rmpp_version; }; +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * received by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @res - Ignored. + * @flags - additional registration flags; Must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bit(s) is(are) set.
+ * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + #define IB_IOCTL_MAGIC 0x1b -#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1) +#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ + struct ib_user_mad_reg_req) -#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32) #define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) + #endif /* IB_USER_MAD_H */ Index: sys/ofed/include/uapi/rdma/ib_user_verbs.h =================================================================== --- sys/ofed/include/uapi/rdma/ib_user_verbs.h +++ sys/ofed/include/uapi/rdma/ib_user_verbs.h @@ -45,12 +45,6 @@ #define IB_USER_VERBS_ABI_VERSION 6 #define IB_USER_VERBS_CMD_THRESHOLD 50 -/* - * To support 6 legacy commands using the old extension style - */ -#define IB_USER_VERBS_LEGACY_CMD_FIRST 52 -#define IB_USER_VERBS_LEGACY_EX_CMD_LAST 56 - enum { IB_USER_VERBS_CMD_GET_CONTEXT, IB_USER_VERBS_CMD_QUERY_DEVICE, @@ -92,15 +86,22 @@ IB_USER_VERBS_CMD_OPEN_XRCD, IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, - IB_USER_VERBS_CMD_OPEN_QP + IB_USER_VERBS_CMD_OPEN_QP, }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL }; - /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -130,14 +131,6 @@ * the rest of the command struct based on these value. */ -#define IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, field) \ - ((ibv_type *)((void *)(ex_ptr) + offsetof(ex_type, \ - field) + sizeof((ex_ptr)->field))) - -#define IBV_RESP_TO_VERBS_RESP_EX(ex_ptr, ex_type, ibv_type) \ - IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, comp_mask) - - #define IB_USER_VERBS_CMD_COMMAND_MASK 0xff #define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u #define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 @@ -172,11 +165,6 @@ __u64 driver_data[0]; }; -struct ib_uverbs_query_device_ex { - __u64 comp_mask; - __u64 driver_data[0]; -}; - struct ib_uverbs_query_device_resp { __u64 fw_ver; __be64 node_guid; @@ -221,6 +209,45 @@ __u8 reserved[4]; }; +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + +struct ib_uverbs_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; + __u32 max_rwq_indirection_tables; + __u32 max_rwq_indirection_table_size; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 response_length; + struct ib_uverbs_odp_caps odp_caps; + __u64 timestamp_mask; + __u64 hca_core_clock; /* in KHZ */ + __u64 device_cap_flags_ex; + struct ib_uverbs_rss_caps rss_caps; + __u32 max_wq_type_rq; + __u32 reserved; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; @@ -296,6 +323,22 @@ __u32 rkey; }; +struct ib_uverbs_rereg_mr { + __u64 response; + __u32 mr_handle; + __u32 flags; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; +}; + struct ib_uverbs_dereg_mr { __u32 mr_handle; }; @@ -334,33 +377,25 @@ __u64 driver_data[0]; }; -struct ib_uverbs_create_cq_resp { - __u32 cq_handle; - __u32 cqe; -}; - -enum ib_uverbs_create_cq_ex_comp_mask { - IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS = (u64)1 << 0, -}; - -struct ib_uverbs_create_cq_ex { - __u64 comp_mask; +struct ib_uverbs_ex_create_cq { __u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; + __u32 comp_mask; + __u32 flags; __u32 reserved; - __u64 create_flags; - __u64 driver_data[0]; }; -struct ib_uverbs_modify_cq_ex { - __u64 comp_mask; +struct ib_uverbs_create_cq_resp { __u32 cq_handle; - __u32 attr_mask; - __u16 cq_count; - __u16 cq_period; - __u32 cq_cap_flags; + __u32 cqe; +}; + +struct ib_uverbs_ex_create_cq_resp { + struct ib_uverbs_create_cq_resp base; + __u32 comp_mask; + __u32 response_length; }; struct ib_uverbs_resize_cq { @@ -502,6 +537,35 @@ __u64 driver_data[0]; }; +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + +struct ib_uverbs_ex_create_qp { + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u32 comp_mask; + __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 reserved1; +}; + struct ib_uverbs_open_qp { __u64 response; __u64 user_handle; @@ -524,6 +588,12 @@ __u32 reserved; }; +struct ib_uverbs_ex_create_qp_resp { + struct ib_uverbs_create_qp_resp base; + __u32 comp_mask; + __u32 response_length; +}; + /* * This struct needs to remain a multiple of 8 bytes to keep the * alignment of the modify QP parameters. 
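A sketch of how userspace is expected to consume the extended-response layout introduced here: the legacy fields live in .base, and response_length says how many bytes the kernel actually filled, guarding any newer trailing fields. The helper is illustrative only and assumes the buffer was filled by an IB_USER_VERBS_EX_CMD_QUERY_DEVICE call:

	/* offsetof() from <stddef.h>; __u64 from <linux/types.h> */
	static int example_read_hca_clock(const struct ib_uverbs_ex_query_device_resp *resp,
	                                  __u64 *clock_khz)
	{
	        __u64 fw_ver = resp->base.fw_ver;       /* legacy fields always valid */
	        (void)fw_ver;

	        /* newer fields only if the kernel reported enough bytes */
	        if (resp->response_length <
	            offsetof(struct ib_uverbs_ex_query_device_resp, hca_core_clock) +
	            sizeof(resp->hca_core_clock))
	                return -1;
	        *clock_khz = resp->hca_core_clock;
	        return 0;
	}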
@@ -614,42 +684,6 @@ __u64 driver_data[0]; }; -enum ib_uverbs_modify_qp_ex_comp_mask { - IB_UVERBS_QP_ATTR_DCT_KEY = 1ULL << 0, -}; - -struct ib_uverbs_modify_qp_ex { - __u32 comp_mask; - struct ib_uverbs_qp_dest dest; - struct ib_uverbs_qp_dest alt_dest; - __u32 qp_handle; - __u32 attr_mask; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 en_sqd_async_notify; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[2]; - __u64 dct_key; - __u64 driver_data[0]; -}; - struct ib_uverbs_modify_qp_resp { }; @@ -784,18 +818,18 @@ }; struct ib_uverbs_flow_spec_hdr { - __u32 type; + __u32 type; __u16 size; __u16 reserved; /* followed by flow_spec */ __u64 flow_spec_data[0]; }; -struct ib_kern_eth_filter { - __u8 dst_mac[6]; - __u8 src_mac[6]; - __be16 ether_type; - __be16 vlan_tag; +struct ib_uverbs_flow_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; }; struct ib_uverbs_flow_spec_eth { @@ -807,16 +841,20 @@ __u16 reserved; }; }; - struct ib_kern_eth_filter val; - struct ib_kern_eth_filter mask; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; }; -struct ib_kern_ib_filter { - __be32 l3_type_qpn; - __u8 dst_gid[16]; +struct ib_uverbs_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; + __u8 proto; + __u8 tos; + __u8 ttl; + __u8 flags; }; -struct ib_uverbs_flow_spec_ib { +struct ib_uverbs_flow_spec_ipv4 { union { struct ib_uverbs_flow_spec_hdr hdr; struct { @@ -825,16 +863,16 @@ __u16 reserved; }; }; - struct ib_kern_ib_filter val; - struct ib_kern_ib_filter mask; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; }; -struct ib_kern_ipv4_filter { - __be32 src_ip; - __be32 dst_ip; +struct ib_uverbs_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; }; -struct ib_uverbs_flow_spec_ipv4 { +struct ib_uverbs_flow_spec_tcp_udp { union { struct ib_uverbs_flow_spec_hdr hdr; struct { @@ -843,16 +881,21 @@ __u16 reserved; }; }; - struct ib_kern_ipv4_filter val; - struct ib_kern_ipv4_filter mask; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_kern_tcp_udp_filter { - __be16 dst_port; - __be16 src_port; +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; + __be32 flow_label; + __u8 next_hdr; + __u8 traffic_class; + __u8 hop_limit; + __u8 reserved; }; -struct ib_uverbs_flow_spec_tcp_udp { +struct ib_uverbs_flow_spec_ipv6 { union { struct ib_uverbs_flow_spec_hdr hdr; struct { @@ -861,18 +904,18 @@ __u16 reserved; }; }; - struct ib_kern_tcp_udp_filter val; - struct ib_kern_tcp_udp_filter mask; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; }; struct ib_uverbs_flow_attr { - __u32 type; - __u16 size; - __u16 priority; - __u8 num_of_specs; - __u8 reserved[2]; - __u8 port; - __u32 flags; + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; /* Following are the optional layers according to user request * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy @@ -959,22 +1002,66 @@ __u32 events_reported; }; +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __u64 user_handle; + __u32 pd_handle; + __u32 
cq_handle; + __u32 max_wr; + __u32 max_sge; +}; -/* - * Legacy extended verbs related structures - */ -struct ib_uverbs_ex_cmd_hdr_legacy { - __u32 command; - __u16 in_words; - __u16 out_words; - __u16 provider_in_words; - __u16 provider_out_words; - __u32 cmd_hdr_reserved; +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 wqn; }; -struct ib_uverbs_ex_cmd_resp1_legacy { - __u64 comp_mask; - __u64 response; +struct ib_uverbs_ex_destroy_wq { + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; +}; + +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; }; #endif /* IB_USER_VERBS_H */ Index: sys/ofed/include/uapi/rdma/mlx4-abi.h =================================================================== --- /dev/null +++ sys/ofed/include/uapi/rdma/mlx4-abi.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_USER_H +#define MLX4_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. 
+ */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +#endif /* MLX4_ABI_USER_H */ Index: sys/ofed/include/uapi/rdma/mlx5-abi.h =================================================================== --- /dev/null +++ sys/ofed/include/uapi/rdma/mlx5-abi.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_ABI_USER_H +#define MLX5_ABI_USER_H + +#include + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
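As the comment above says, this ABI never carries pointer types directly; a userspace provider casts buffer addresses through __u64, for example when filling the create-CQ command defined above (a sketch; the wrapper function is hypothetical):

	static void example_fill_create_cq(struct mlx4_ib_create_cq *cmd,
	                                   void *cq_buf, void *db_rec)
	{
	        cmd->buf_addr = (__u64)(uintptr_t)cq_buf;
	        cmd->db_addr  = (__u64)(uintptr_t)db_rec;
	}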
+ */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + +enum mlx5_user_cmds_supp_uhw { + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_uuars; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 reserved1; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 cmds_supp_uhw; + __u16 reserved2; + __u64 hca_core_clock_offset; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_rss_caps { + __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +struct mlx5_ib_query_device_resp { + __u32 comp_mask; + __u32 response_length; + struct mlx5_ib_tso_caps tso_caps; + struct mlx5_ib_rss_caps rss_caps; +}; + +struct mlx5_ib_create_cq { + __u64 buf_addr; + __u64 db_addr; + __u32 cqe_size; + __u32 reserved; /* explicit padding (optional on i386) */ +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __u64 buf_addr; + __u64 db_addr; + __u32 flags; + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __u64 buf_addr; + __u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; + __u32 uidx; + __u32 reserved0; + __u64 sq_buf_addr; +}; + +/* RX Hash function flags */ +enum mlx5_rx_hash_function_flags { + MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. 
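For example, a mask that respects the constraint above selects IPv4 together with TCP ports, never mixing IPv4 with IPv6 bits or TCP with UDP bits (using the mlx5_rx_hash_fields values defined just below; where the mask ends up, such as mlx5_ib_create_qp_rss.rx_hash_fields_mask, depends on the caller):

	/* sketch: hash on the IPv4 addresses and TCP ports only */
	static const __u64 example_rss_ipv4_tcp =
	        MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 |
	        MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP;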
+*/ +enum mlx5_rx_hash_fields { + MLX5_RX_HASH_SRC_IPV4 = 1 << 0, + MLX5_RX_HASH_DST_IPV4 = 1 << 1, + MLX5_RX_HASH_SRC_IPV6 = 1 << 2, + MLX5_RX_HASH_DST_IPV6 = 1 << 3, + MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX5_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + +struct mlx5_ib_create_qp_rss { + __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 rx_key_len; /* valid only for Toeplitz */ + __u8 reserved[6]; + __u8 rx_hash_key[128]; /* valid only for Toeplitz */ + __u32 comp_mask; + __u32 reserved1; +}; + +struct mlx5_ib_create_qp_resp { + __u32 uuar_index; +}; + +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + +struct mlx5_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx5_ib_create_wq_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; +#endif /* MLX5_ABI_USER_H */ Index: sys/ofed/include/uapi/rdma/rdma_user_cm.h =================================================================== --- sys/ofed/include/uapi/rdma/rdma_user_cm.h +++ sys/ofed/include/uapi/rdma/rdma_user_cm.h @@ -34,6 +34,7 @@ #define RDMA_USER_CM_H #include +#include #include #include #include @@ -45,8 +46,8 @@ enum { RDMA_USER_CM_CMD_CREATE_ID, RDMA_USER_CM_CMD_DESTROY_ID, - RDMA_USER_CM_CMD_BIND_ADDR, - RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_BIND_IP, + RDMA_USER_CM_CMD_RESOLVE_IP, RDMA_USER_CM_CMD_RESOLVE_ROUTE, RDMA_USER_CM_CMD_QUERY_ROUTE, RDMA_USER_CM_CMD_CONNECT, @@ -59,9 +60,13 @@ RDMA_USER_CM_CMD_GET_OPTION, RDMA_USER_CM_CMD_SET_OPTION, RDMA_USER_CM_CMD_NOTIFY, - RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_JOIN_IP_MCAST, RDMA_USER_CM_CMD_LEAVE_MCAST, - RDMA_USER_CM_CMD_MIGRATE_ID + RDMA_USER_CM_CMD_MIGRATE_ID, + RDMA_USER_CM_CMD_QUERY, + RDMA_USER_CM_CMD_BIND, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_JOIN_MCAST }; /* @@ -95,28 +100,51 @@ __u32 events_reported; }; -struct rdma_ucm_bind_addr { +struct rdma_ucm_bind_ip { __u64 response; struct sockaddr_in6 addr; __u32 id; }; -struct rdma_ucm_resolve_addr { +struct rdma_ucm_bind { + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct rdma_ucm_resolve_ip { struct sockaddr_in6 src_addr; struct sockaddr_in6 dst_addr; __u32 id; __u32 timeout_ms; }; +struct rdma_ucm_resolve_addr { + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + struct rdma_ucm_resolve_route { __u32 id; __u32 timeout_ms; }; -struct rdma_ucm_query_route { +enum { + RDMA_USER_CM_QUERY_ADDR, + RDMA_USER_CM_QUERY_PATH, + RDMA_USER_CM_QUERY_GID +}; + +struct rdma_ucm_query { __u64 response; __u32 id; - __u32 reserved; + __u32 option; }; struct rdma_ucm_query_route_resp { @@ -129,9 +157,26 @@ __u8 reserved[3]; }; +struct rdma_ucm_query_addr_resp { + __u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct rdma_ucm_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ib_path_rec_data path_data[0]; +}; + struct 
rdma_ucm_conn_param { __u32 qp_num; - __u32 reserved; + __u32 qkey; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; __u8 private_data_len; __u8 srq; @@ -192,13 +237,29 @@ __u32 event; }; -struct rdma_ucm_join_mcast { +struct rdma_ucm_join_ip_mcast { __u64 response; /* rdma_ucm_create_id_resp */ __u64 uid; struct sockaddr_in6 addr; __u32 id; }; +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + +struct rdma_ucm_join_mcast { + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 join_flags; + struct sockaddr_storage addr; +}; + struct rdma_ucm_get_event { __u64 response; }; @@ -222,12 +283,10 @@ /* Option details */ enum { - RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, - - RDMA_OPTION_IB_PATH = 1, - RDMA_OPTION_IB_APM = 2, + RDMA_OPTION_IB_PATH = 1 }; struct rdma_ucm_set_option {