Index: head/sys/contrib/rdma/krping/krping.c =================================================================== --- head/sys/contrib/rdma/krping/krping.c (revision 277401) +++ head/sys/contrib/rdma/krping/krping.c (revision 277402) @@ -1,2343 +1,2345 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" extern int krping_debug; #define DEBUG_LOG(cb, x...) if (krping_debug) krping_printf((cb)->cookie, x) #define PRINTF(cb, x...) krping_printf((cb)->cookie, x) MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping client/server"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(krping, 1); +MODULE_DEPEND(krping, linuxapi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, FASTREG = 2, MW = 3, MR = 4 }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_NOPARAM, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static struct mutex krping_mutex; /* * List of running krping threads. 
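 * Each entry is a struct krping_cb: krping_doit() links it in through
 * cb->list under krping_mutex, and krping_walk_cb_list() below walks
 * the list under the same mutex to report per-run statistics.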
*/ static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct. */ struct krping_cb { void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; struct ib_send_wr fastreg_wr; struct ib_send_wr invalidate_wr; struct ib_mr *fastreg_mr; int server_invalidate; int read_inv; u8 key; struct ib_mw *mw; struct ib_mw_bind bind_attr; struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ struct krping_rdma_info recv_buf;/* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work requrest record */ struct ib_sge send_sgl; struct krping_rdma_info send_buf;/* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) struct ib_mr *send_mr; struct ib_send_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guys RKEY */ uint64_t remote_addr; /* remote guys TO */ uint32_t remote_len; /* remote guys LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ struct in_addr addr; /* dst addr in NBO */ char *addr_str; /* dst addr string */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ int frtest; /* fastreg test */ /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ? 
"parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { PRINTF(cb, "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: PRINTF(cb, "cma detected device removal!!!!\n"); break; default: PRINTF(cb, "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { PRINTF(cb, "cq completion in ERROR state\n"); return; } if (cb->frtest) { PRINTF(cb, "cq completion event in frtest!\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG(cb, "cq flushed\n"); continue; } else { PRINTF(cb, "cq completion failed with " "wr_id %Lx status %d opcode %d vender_err %x\n", wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += 
sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: PRINTF(cb, "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { PRINTF(cb, "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; if (cb->local_dma_lkey) cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; if (cb->local_dma_lkey) cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw) { cb->rdma_sgl.addr = cb->rdma_dma_addr; if (cb->mem == MR) cb->rdma_sgl.lkey = cb->rdma_mr->lkey; cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.num_sge = 1; } switch(cb->mem) { case FASTREG: /* * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. * both unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration. 
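 *
 * In outline, mirroring the fields initialized just below and the
 * single post issued from krping_rdma_rkey() (one ib_post_send() of
 * the chain head covers both WRs):
 *
 *	invalidate_wr.opcode = IB_WR_LOCAL_INV;
 *	invalidate_wr.next = &fastreg_wr;
 *	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
 *	ib_post_send(qp, &invalidate_wr, &bad_wr);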
*/ cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; cb->fastreg_wr.wr.fast_reg.length = cb->size; cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; cb->invalidate_wr.next = &cb->fastreg_wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; break; case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ cb->bind_attr.length = cb->size; break; default: break; } } static int krping_setup_buffers(struct krping_cb *cb) { int ret; struct ib_phys_buf buf; u64 iovbase; DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { DEBUG_LOG(cb, "reg_dmamr failed\n"); ret = PTR_ERR(cb->dma_mr); goto bail; } } else { if (!cb->local_dma_lkey) { buf.addr = cb->recv_dma_addr; buf.size = sizeof cb->recv_buf; DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr, (int)buf.size); iovbase = cb->recv_dma_addr; cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE, &iovbase); if (IS_ERR(cb->recv_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->recv_mr); goto bail; } buf.addr = cb->send_dma_addr; buf.size = sizeof cb->send_buf; DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr, (int)buf.size); iovbase = cb->send_dma_addr; cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 0, &iovbase); if (IS_ERR(cb->send_mr)) { DEBUG_LOG(cb, "send_buf reg_mr failed\n"); ret = PTR_ERR(cb->send_mr); goto bail; } } } cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(cb, "rdma_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->rdma_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); if (cb->mem != DMA) { switch (cb->mem) { case FASTREG: cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->page_list = ib_alloc_fast_reg_page_list( cb->pd->device, cb->page_list_len); if (IS_ERR(cb->page_list)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->page_list); goto bail; } cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, cb->page_list->max_page_list_len); if (IS_ERR(cb->fastreg_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->fastreg_mr); goto bail; } DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" " page_list_len %u\n", cb->fastreg_mr->rkey, cb->page_list, cb->page_list_len); break; case MW: cb->mw = ib_alloc_mw(cb->pd); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); goto bail; } DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); /*FALLTHROUGH*/ case MR: buf.addr = cb->rdma_dma_addr; buf.size = cb->size; iovbase = cb->rdma_dma_addr; cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); if (IS_ERR(cb->rdma_mr)) { DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); ret = PTR_ERR(cb->rdma_mr); goto bail; } DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n", buf.addr, (int)buf.size, 
cb->rdma_mr->rkey); break; default: ret = -EINVAL; goto bail; break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw) { cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(cb, "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->start_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; if (cb->wlat || cb->rlat || cb->bw) flags |= IB_ACCESS_REMOTE_WRITE; buf.addr = cb->start_dma_addr; buf.size = cb->size; DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n", buf.addr, (int)buf.size); iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); goto bail; } } } krping_setup_wr(cb); DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; bail: if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) ib_dereg_mr(cb->fastreg_mr); if (cb->mw && !IS_ERR(cb->mw)) ib_dealloc_mw(cb->mw); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->page_list && !IS_ERR(cb->page_list)) ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); if (cb->rdma_buf) kfree(cb->rdma_buf); if (cb->start_buf) kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->send_mr) ib_dereg_mr(cb->send_mr); if (cb->recv_mr) ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->fastreg_mr) ib_dereg_mr(cb->fastreg_mr); if (cb->mw) ib_dealloc_mw(cb->mw); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->rdma_buf); if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->start_buf); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG(cb, "created pd %p\n", cb->pd); 
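	/*
	 * A single CQ, sized at twice the SQ depth, is shared by send and
	 * receive work: krping_create_qp() points both init_attr.send_cq
	 * and init_attr.recv_cq at cb->cq.  Completion upcalls are armed
	 * below only for the ping/pong test; the rlat/wlat/bw/fr tests
	 * poll the CQ directly.
	 */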
strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG(cb, "created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { PRINTF(cb, "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { PRINTF(cb, "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG(cb, "created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. * FASTREG mode: invalidate and rebind via fastreg wr. * MW mode: rebind the MW. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey = 0xffffffff; u64 p; struct ib_send_wr *bad_wr; int i; int ret; switch (cb->mem) { case FASTREG: cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; /* * Update the fastreg key. */ ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; /* * Update the fastreg WR with new buf info. */ if (buf == (u64)cb->start_dma_addr) cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; else cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; cb->fastreg_wr.wr.fast_reg.iova_start = buf; p = (u64)(buf & PAGE_MASK); for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; i++, p += PAGE_SIZE) { cb->page_list->page_list[i] = p; DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p); } DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" " iova_start %llx page_list_len %u\n", post_inv, cb->fastreg_wr.wr.fast_reg.rkey, cb->fastreg_wr.wr.fast_reg.page_shift, cb->fastreg_wr.wr.fast_reg.length, cb->fastreg_wr.wr.fast_reg.iova_start, cb->fastreg_wr.wr.fast_reg.page_list_len); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->fastreg_mr->rkey; break; case MW: /* * Update the MW with new buf info. */ if (buf == (u64)cb->start_dma_addr) { cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.mr = cb->start_mr; } else { cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.mr = cb->rdma_mr; } cb->bind_attr.addr = buf; DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n", cb->mw->rkey, buf, cb->bind_attr.mr->rkey); ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); cb->state = ERROR; } else rkey = cb->mw->rkey; break; case MR: if (buf == (u64)cb->start_dma_addr) rkey = cb->start_mr->rkey; else rkey = cb->rdma_mr->rkey; break; case DMA: rkey = cb->dma_mr->rkey; break; default: PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); cb->state = ERROR; break; } return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do fastreg or mw bind before * advertising the rdma buffer. Server side * sends have no data. 
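 * The advertisement itself is the 16-byte struct krping_rdma_info
 * (buf, rkey, and size, all in network byte order), which the peer
 * decodes in server_recv().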
*/ if (!cb->server || cb->wlat || cb->rlat || cb->bw) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); cb->rdma_sq_wr.next = NULL; /* Issue RDMA Read. */ if (cb->read_inv) cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; if (cb->mem == FASTREG) { /* * Immediately follow the read with a * fenced LOCAL_INV. */ cb->rdma_sq_wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } cb->rdma_sq_wr.next = NULL; DEBUG_LOG(cb, "server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { PRINTF(cb, "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "server ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "server ping data: %s\n", cb->rdma_buf); } /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", cb->rdma_sq_wr.sg_list->lkey, (unsigned long long)cb->rdma_sq_wr.sg_list->addr, cb->rdma_sq_wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server rdma write 
complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { PRINTF(cb, "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", stop_tv.tv_sec - start_tv.tv_sec, stop_tv.tv_usec - start_tv.tv_usec, scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer 
changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { PRINTF(cb, "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); PRINTF(cb, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", stop_tv.tv_sec - start_tv.tv_sec, stop_tv.tv_usec - start_tv.tv_usec, scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, 
&cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", stop_tv.tv_sec - start_tv.tv_sec, stop_tv.tv_usec - start_tv.tv_usec, scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { 
PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int fastreg_supported(struct krping_cb *cb) { struct ib_device *dev = cb->child_cm_id->device; struct ib_device_attr attr; int ret; ret = ib_query_device(dev, &attr); if (ret) { PRINTF(cb, "ib_query_device failed ret %d\n", ret); return 0; } if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n", attr.device_cap_flags); return 0; } DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n", attr.device_cap_flags); return 1; } static int krping_bind_server(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); if (ret) { PRINTF(cb, "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG(cb, "rdma_bind_addr successful\n"); DEBUG_LOG(cb, "rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { PRINTF(cb, "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (cb->mem == FASTREG && !fastreg_supported(cb)) return -EINVAL; return 0; } static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb *cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete. 
*/ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { PRINTF(cb, "data mismatch!\n"); break; } if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "ping data: %s\n", cb->rdma_buf); } #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = 0; cb->rdma_sq_wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } static void krping_fr_test(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; time_t start; int count = 0; int scnt = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = 0xcafebabe | i; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (1) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "fr_test: pausing 1 second! 
count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible(cb->sem, cb->state == ERROR); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); fr.wr.fast_reg.rkey = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list_len = plen; ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } scnt++; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u\n", wc.status); goto err2; } count++; scnt--; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } err2: #if 0 DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "fr_test: done!\n"); ib_dereg_mr(mr); err1: ib_free_fast_reg_page_list(pl); } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG(cb, "rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 2000); if (ret) { PRINTF(cb, "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { PRINTF(cb, "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } if (cb->mem == FASTREG && !fastreg_supported(cb)) return -EINVAL; DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } 
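/*
 * A minimal, hypothetical sketch of driving the entry point below.
 * The function name and the origin of "cookie" are illustrative only
 * (the in-tree module normally hands option strings in from its
 * control interface); the option keywords come from krping_opts above.
 * Kept under #if 0 since the real glue lives outside this file.
 */
#if 0
static void
krping_example_run(void *cookie)
{
	/* Connect to 192.168.0.1:9999 and do 4 validated 64-byte pings. */
	char cmd[] = "client,addr=192.168.0.1,port=9999,count=4,size=64,validate";
	int error;

	error = krping_doit(cmd, cookie);
	if (error != 0)
		printf("krping: run failed, error %d\n", error);
}
#endif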
int krping_doit(char *cmd, void *cookie) { struct krping_cb *cb; int op; int ret = 0; char *optarg; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); if (!inet_aton(optarg, &cb->addr)) { PRINTF(cb, "bad addr string %s\n", optarg); ret = EINVAL; } break; case 'p': cb->port = htons(optint); DEBUG_LOG(cb, "port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG(cb, "server\n"); break; case 's': cb->server = 1; DEBUG_LOG(cb, "server\n"); break; case 'c': cb->server = 0; DEBUG_LOG(cb, "client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { PRINTF(cb, "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG(cb, "size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { PRINTF(cb, "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG(cb, "count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG(cb, "verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG(cb, "validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'm': if (!strncmp(optarg, "dma", 3)) cb->mem = DMA; else if (!strncmp(optarg, "fastreg", 7)) cb->mem = FASTREG; else if (!strncmp(optarg, "mw", 2)) cb->mem = MW; else if (!strncmp(optarg, "mr", 2)) cb->mem = MR; else { PRINTF(cb, "unknown mem mode %s. 
" "Must be dma, fastreg, mw, or mr\n", optarg); ret = -EINVAL; break; } break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG(cb, "using local dma lkey\n"); break; case 'R': cb->read_inv = 1; DEBUG_LOG(cb, "using read-with-inv\n"); break; case 'f': cb->frtest = 1; DEBUG_LOG(cb, "fast-reg test!\n"); break; default: PRINTF(cb, "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { PRINTF(cb, "must be either client or server\n"); ret = -EINVAL; goto out; } if (cb->server && cb->frtest) { PRINTF(cb, "must be client to run frtest\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->server_invalidate && cb->mem != FASTREG) { PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->read_inv && cb->mem != FASTREG) { PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) { PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_init(void) { mutex_init(&krping_mutex); } Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c =================================================================== --- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c (revision 277401) +++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c (revision 277402) @@ -1,301 +1,300 @@ /************************************************************************** Copyright (c) 2007, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include #include #include #include #include #include static int iwch_mod_load(void); static int iwch_mod_unload(void); static int iwch_activate(struct adapter *); static int iwch_deactivate(struct adapter *); static struct uld_info iwch_uld_info = { .uld_id = ULD_IWARP, .activate = iwch_activate, .deactivate = iwch_deactivate, }; static void rnic_init(struct iwch_dev *rnicp) { idr_init(&rnicp->cqidr); idr_init(&rnicp->qpidr); idr_init(&rnicp->mmidr); mtx_init(&rnicp->lock, "iwch rnic lock", NULL, MTX_DEF|MTX_DUPOK); rnicp->attr.vendor_id = 0x168; rnicp->attr.vendor_part_id = 7; rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; rnicp->attr.max_wrs = T3_MAX_QP_DEPTH; rnicp->attr.max_sge_per_wr = T3_MAX_SGE; rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH; rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK; rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; rnicp->attr.can_resize_wq = 0; rnicp->attr.max_rdma_reads_per_qp = 8; rnicp->attr.max_rdma_read_resources = rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ rnicp->attr.max_rdma_read_depth = rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; rnicp->attr.rq_overflow_handled = 0; rnicp->attr.can_modify_ird = 0; rnicp->attr.can_modify_ord = 0; rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; rnicp->attr.stag0_value = 1; rnicp->attr.zbva_support = 1; rnicp->attr.local_invalidate_fence = 1; rnicp->attr.cq_overflow_detection = 1; return; } static void rnic_uninit(struct iwch_dev *rnicp) { idr_destroy(&rnicp->cqidr); idr_destroy(&rnicp->qpidr); idr_destroy(&rnicp->mmidr); mtx_destroy(&rnicp->lock); } static int iwch_activate(struct adapter *sc) { struct iwch_dev *rnicp; int rc; KASSERT(!isset(&sc->offload_map, MAX_NPORTS), ("%s: iWARP already activated on %s", __func__, device_get_nameunit(sc->dev))); rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); if (rnicp == NULL) return (ENOMEM); sc->iwarp_softc = rnicp; rnicp->rdev.adap = sc; cxio_hal_init(sc); iwch_cm_init_cpl(sc); rc = cxio_rdev_open(&rnicp->rdev); if (rc != 0) { printf("Unable to open CXIO rdev\n"); goto err1; } rnic_init(rnicp); rc = iwch_register_device(rnicp); if (rc != 0) { printf("Unable to register device\n"); goto err2; } return (0); err2: rnic_uninit(rnicp); cxio_rdev_close(&rnicp->rdev); err1: cxio_hal_uninit(sc); iwch_cm_term_cpl(sc); sc->iwarp_softc = NULL; return (rc); } 
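/*
 * Note the unwind order in iwch_activate() above: a failure after
 * cxio_rdev_open() succeeds tears down with rnic_uninit() and
 * cxio_rdev_close() before the HAL and CPL handlers are shut down,
 * and sc->iwarp_softc is cleared so a later activation starts clean.
 */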
static int iwch_deactivate(struct adapter *sc) { struct iwch_dev *rnicp; rnicp = sc->iwarp_softc; iwch_unregister_device(rnicp); rnic_uninit(rnicp); cxio_rdev_close(&rnicp->rdev); cxio_hal_uninit(sc); iwch_cm_term_cpl(sc); ib_dealloc_device(&rnicp->ibdev); sc->iwarp_softc = NULL; return (0); } static void iwch_activate_all(struct adapter *sc, void *arg __unused) { ADAPTER_LOCK(sc); if ((sc->open_device_map & sc->offload_map) != 0 && t3_activate_uld(sc, ULD_IWARP) == 0) setbit(&sc->offload_map, MAX_NPORTS); ADAPTER_UNLOCK(sc); } static void iwch_deactivate_all(struct adapter *sc, void *arg __unused) { ADAPTER_LOCK(sc); if (isset(&sc->offload_map, MAX_NPORTS) && t3_deactivate_uld(sc, ULD_IWARP) == 0) clrbit(&sc->offload_map, MAX_NPORTS); ADAPTER_UNLOCK(sc); } static int iwch_mod_load(void) { int rc; rc = iwch_cm_init(); if (rc != 0) return (rc); rc = t3_register_uld(&iwch_uld_info); if (rc != 0) { iwch_cm_term(); return (rc); } t3_iterate(iwch_activate_all, NULL); return (rc); } static int iwch_mod_unload(void) { t3_iterate(iwch_deactivate_all, NULL); iwch_cm_term(); if (t3_unregister_uld(&iwch_uld_info) == EBUSY) return (EBUSY); return (0); } #endif /* TCP_OFFLOAD */ -#undef MODULE_VERSION -#include - static int iwch_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = iwch_mod_load(); if (rc) printf("iw_cxgb: Chelsio T3 RDMA Driver failed to load\n"); else printf("iw_cxgb: Chelsio T3 RDMA Driver loaded\n"); break; case MOD_UNLOAD: rc = iwch_mod_unload(); if (rc) printf("iw_cxgb: Chelsio T3 RDMA Driver failed to unload\n"); else printf("iw_cxgb: Chelsio T3 RDMA Driver unloaded\n"); break; default: rc = EINVAL; } #else printf("iw_cxgb: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t iwch_mod_data = { "iw_cxgb", iwch_modevent, 0 }; MODULE_VERSION(iw_cxgb, 1); DECLARE_MODULE(iw_cxgb, iwch_mod_data, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(t3_tom, cxgbc, 1, 1, 1); MODULE_DEPEND(iw_cxgb, toecore, 1, 1, 1); MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1); MODULE_DEPEND(iw_cxgb, ibcore, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, linuxapi, 1, 1, 1); + Index: head/sys/dev/cxgbe/iw_cxgbe/device.c =================================================================== --- head/sys/dev/cxgbe/iw_cxgbe/device.c (revision 277401) +++ head/sys/dev/cxgbe/iw_cxgbe/device.c (revision 277402) @@ -1,369 +1,368 @@ /* * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #ifdef TCP_OFFLOAD #include "iw_cxgbe.h" int spg_creds = 2; /* Default status page size is 2 credits = 128B */ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) { struct list_head *pos, *nxt; struct c4iw_qid_list *entry; mutex_lock(&uctx->lock); list_for_each_safe(pos, nxt, &uctx->qpids) { entry = list_entry(pos, struct c4iw_qid_list, entry); list_del_init(&entry->entry); if (!(entry->qid & rdev->qpmask)) { c4iw_put_resource(&rdev->resource.qid_table, entry->qid); mutex_lock(&rdev->stats.lock); rdev->stats.qid.cur -= rdev->qpmask + 1; mutex_unlock(&rdev->stats.lock); } kfree(entry); } list_for_each_safe(pos, nxt, &uctx->cqids) { entry = list_entry(pos, struct c4iw_qid_list, entry); list_del_init(&entry->entry); kfree(entry); } mutex_unlock(&uctx->lock); } void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) { INIT_LIST_HEAD(&uctx->qpids); INIT_LIST_HEAD(&uctx->cqids); mutex_init(&uctx->lock); } static int c4iw_rdev_open(struct c4iw_rdev *rdev) { struct adapter *sc = rdev->adap; int rc; c4iw_init_dev_ucontext(rdev, &rdev->uctx); /* Save the status page size set by if_cxgbe */ spg_creds = (t4_read_reg(sc, A_SGE_CONTROL) & F_EGRSTATUSPAGESIZE) ? 2 : 1; /* XXX: we can probably make this work */ if (sc->sge.eq_s_qpp > PAGE_SHIFT || sc->sge.iq_s_qpp > PAGE_SHIFT) { device_printf(sc->dev, "doorbell density too high (eq %d, iq %d, pg %d).\n", sc->sge.eq_s_qpp, sc->sge.iq_s_qpp, PAGE_SHIFT); rc = -EINVAL; goto err1; } rdev->qpshift = PAGE_SHIFT - sc->sge.eq_s_qpp; rdev->qpmask = (1 << sc->sge.eq_s_qpp) - 1; rdev->cqshift = PAGE_SHIFT - sc->sge.iq_s_qpp; rdev->cqmask = (1 << sc->sge.iq_s_qpp) - 1; if (c4iw_num_stags(rdev) == 0) { rc = -EINVAL; goto err1; } rdev->stats.pd.total = T4_MAX_NUM_PD; rdev->stats.stag.total = sc->vres.stag.size; rdev->stats.pbl.total = sc->vres.pbl.size; rdev->stats.rqt.total = sc->vres.rq.size; rdev->stats.qid.total = sc->vres.qp.size; rc = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); if (rc) { device_printf(sc->dev, "error %d initializing resources\n", rc); goto err1; } rc = c4iw_pblpool_create(rdev); if (rc) { device_printf(sc->dev, "error %d initializing pbl pool\n", rc); goto err2; } rc = c4iw_rqtpool_create(rdev); if (rc) { device_printf(sc->dev, "error %d initializing rqt pool\n", rc); goto err3; } return (0); err3: c4iw_pblpool_destroy(rdev); err2: c4iw_destroy_resource(&rdev->resource); err1: return (rc); } static void c4iw_rdev_close(struct c4iw_rdev *rdev) { c4iw_pblpool_destroy(rdev); c4iw_rqtpool_destroy(rdev); c4iw_destroy_resource(&rdev->resource); } static void c4iw_dealloc(struct c4iw_dev *iwsc) { c4iw_rdev_close(&iwsc->rdev); idr_destroy(&iwsc->cqidr); idr_destroy(&iwsc->qpidr); idr_destroy(&iwsc->mmidr); ib_dealloc_device(&iwsc->ibdev); } static struct c4iw_dev * c4iw_alloc(struct adapter *sc) { struct c4iw_dev *iwsc; int rc; iwsc = (struct c4iw_dev *)ib_alloc_device(sizeof(*iwsc)); if (iwsc == NULL) { device_printf(sc->dev, "Cannot allocate ib device.\n"); return (ERR_PTR(-ENOMEM)); } iwsc->rdev.adap = sc; rc = c4iw_rdev_open(&iwsc->rdev); if (rc != 0) { device_printf(sc->dev, "Unable to open CXIO rdev
(%d)\n", rc); ib_dealloc_device(&iwsc->ibdev); return (ERR_PTR(rc)); } idr_init(&iwsc->cqidr); idr_init(&iwsc->qpidr); idr_init(&iwsc->mmidr); spin_lock_init(&iwsc->lock); mutex_init(&iwsc->rdev.stats.lock); return (iwsc); } static int c4iw_mod_load(void); static int c4iw_mod_unload(void); static int c4iw_activate(struct adapter *); static int c4iw_deactivate(struct adapter *); static struct uld_info c4iw_uld_info = { .uld_id = ULD_IWARP, .activate = c4iw_activate, .deactivate = c4iw_deactivate, }; static int c4iw_activate(struct adapter *sc) { struct c4iw_dev *iwsc; int rc; ASSERT_SYNCHRONIZED_OP(sc); if (isset(&sc->offload_map, MAX_NPORTS)) { KASSERT(0, ("%s: RDMA already eanbled on sc %p", __func__, sc)); return (0); } if (sc->rdmacaps == 0) { device_printf(sc->dev, "RDMA not supported or RDMA cap is not enabled.\n"); return (ENOSYS); } iwsc = c4iw_alloc(sc); if (IS_ERR(iwsc)) { rc = -PTR_ERR(iwsc); device_printf(sc->dev, "initialization failed: %d\n", rc); return (rc); } sc->iwarp_softc = iwsc; c4iw_cm_init_cpl(sc); rc = -c4iw_register_device(iwsc); if (rc) { device_printf(sc->dev, "RDMA registration failed: %d\n", rc); c4iw_dealloc(iwsc); sc->iwarp_softc = NULL; } return (rc); } static int c4iw_deactivate(struct adapter *sc) { struct c4iw_dev *iwsc = sc->iwarp_softc; ASSERT_SYNCHRONIZED_OP(sc); c4iw_unregister_device(iwsc); c4iw_dealloc(iwsc); sc->iwarp_softc = NULL; return (0); } static void c4iw_activate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4iwact") != 0) return; if (!isset(&sc->offload_map, MAX_NPORTS) && t4_activate_uld(sc, ULD_IWARP) == 0) setbit(&sc->offload_map, MAX_NPORTS); end_synchronized_op(sc, 0); } static void c4iw_deactivate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4iwdea") != 0) return; if (isset(&sc->offload_map, MAX_NPORTS) && t4_deactivate_uld(sc, ULD_IWARP) == 0) clrbit(&sc->offload_map, MAX_NPORTS); end_synchronized_op(sc, 0); } static int c4iw_mod_load(void) { int rc; rc = -c4iw_cm_init(); if (rc != 0) return (rc); rc = t4_register_uld(&c4iw_uld_info); if (rc != 0) { c4iw_cm_term(); return (rc); } t4_iterate(c4iw_activate_all, NULL); return (rc); } static int c4iw_mod_unload(void) { t4_iterate(c4iw_deactivate_all, NULL); c4iw_cm_term(); if (t4_unregister_uld(&c4iw_uld_info) == EBUSY) return (EBUSY); return (0); } #endif -#undef MODULE_VERSION -#include /* * t4_tom won't load on kernels without TCP_OFFLOAD and this module's dependency * on t4_tom ensures that it won't either. So we don't directly check for * TCP_OFFLOAD here. 
*/ static int c4iw_modevent(module_t mod, int cmd, void *arg) { int rc = 0; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = c4iw_mod_load(); if (rc == 0) printf("iw_cxgbe: Chelsio T4/T5 RDMA driver loaded.\n"); break; case MOD_UNLOAD: rc = c4iw_mod_unload(); break; default: rc = EINVAL; } #else printf("iw_cxgbe: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t c4iw_mod_data = { "iw_cxgbe", c4iw_modevent, 0 }; MODULE_VERSION(iw_cxgbe, 1); MODULE_DEPEND(iw_cxgbe, t4nex, 1, 1, 1); MODULE_DEPEND(iw_cxgbe, t4_tom, 1, 1, 1); MODULE_DEPEND(iw_cxgbe, ibcore, 1, 1, 1); +MODULE_DEPEND(iw_cxgbe, linuxapi, 1, 1, 1); DECLARE_MODULE(iw_cxgbe, c4iw_mod_data, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/ofed/drivers/infiniband/core/device.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/device.c (revision 277401) +++ head/sys/ofed/drivers/infiniband/core/device.c (revision 277402) @@ -1,772 +1,770 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __ia64__ /* workaround for a bug in hp chipset that would cause kernel panic when dma resources are exhausted */ int dma_map_sg_hp_wa = 0; #endif struct ib_client_data { struct list_head list; struct ib_client *client; void * data; }; static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* * device_mutex protects access to both device_list and client_list. * There's no real point to using multiple locks or something fancier * like an rwsem: we always access both lists, and we're always * modifying one list or the other list. In any case this is not a * hot path so there's no point in trying to optimize.
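* One consequence: client add/remove callbacks run with device_mutex held, so they are serialized with respect to each other and must not call back into the registration functions.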
*/ static DEFINE_MUTEX(device_mutex); static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(dereg_mr) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } static int start_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; } static int end_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : device->phys_port_cnt; } /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *ib_alloc_device(size_t size) { BUG_ON(size < sizeof (struct ib_device)); return kzalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). 
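* A device that was never registered is freed immediately; a registered device must be unregistered first, after which the final kobject reference releases it.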
*/ void ib_dealloc_device(struct ib_device *device) { if (device->reg_state == IB_DEV_UNINITIALIZED) { kfree(device); return; } BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); return 0; } static int read_port_table_lengths(struct ib_device *device) { struct ib_port_attr *tprops = NULL; int num_ports, ret = -ENOMEM; u8 port_index; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) goto out; num_ports = end_port(device) - start_port(device) + 1; device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, GFP_KERNEL); device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, GFP_KERNEL); if (!device->pkey_tbl_len || !device->gid_tbl_len) goto err; for (port_index = 0; port_index < num_ports; ++port_index) { ret = ib_query_port(device, port_index + start_port(device), tprops); if (ret) goto err; device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; device->gid_tbl_len[port_index] = tprops->gid_tbl_len; } ret = 0; goto out; err: kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); out: kfree(tprops); return ret; } /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). */ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } INIT_LIST_HEAD(&device->event_handler_list); INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); device->ib_uverbs_xrcd_table = RB_ROOT; mutex_init(&device->xrcd_table_mutex); ret = read_port_table_lengths(device); if (ret) { printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", device->name); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); goto out; } list_add_tail(&device->core_list, &device_list); device->reg_state = IB_DEV_REGISTERED; { struct ib_client *client; list_for_each_entry(client, &client_list, list) if (client->add && !add_client_context(device, client)) client->add(device); } out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. 
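* Remove callbacks are made in the reverse of registration order, and any client contexts still attached to the device are freed.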
*/ void ib_unregister_device(struct ib_device *device) { struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry_reverse(client, &client_list, list) if (client->remove) client->remove(device); list_del(&device->core_list); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. */ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) { if (client->remove) client->remove(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { list_del(&context->list); kfree(context); } spin_unlock_irqrestore(&device->client_data_lock, flags); } list_del(&client->list); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). 
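* A warning is logged if the client never attached a context to the device.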
*/ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } printk(KERN_WARNING "No client context found for %s/%s\n", device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_device - Query IB device attributes * @device:Device to query * @device_attr:Device attributes * * ib_query_device() returns the attributes of a device through the * @device_attr pointer. */ int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr) { return device->query_device(device, device_attr); } EXPORT_SYMBOL(ib_query_device); /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID * * ib_query_gid() fetches the specified GID table entry. 
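* The core performs no caching here; the request is forwarded directly to the driver's query_gid method.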
*/ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid) { return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = start_port(device); port <= end_port(device); ++port) { for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. 
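* The comparison ignores the membership bit, so a full-membership P_Key also matches its limited-membership counterpart.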
*/ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { *index = i; return 0; } } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); static int __init ib_core_init(void) { int ret; #ifdef __ia64__ if (ia64_platform_is("hpzx1")) dma_map_sg_hp_wa = 1; #endif ret = ib_sysfs_setup(); if (ret) printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); ib_sysfs_cleanup(); } return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. */ flush_scheduled_work(); } module_init(ib_core_init); module_exit(ib_core_cleanup); -#undef MODULE_VERSION -#include static int ibcore_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ibcore_mod = { .name = "ibcore", .evhand = ibcore_evhand, }; MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxapi, 1, 1, 1); DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY); Index: head/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 277401) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 277402) @@ -1,2423 +1,2423 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #ifdef __linux__ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE "April 4, 2008" #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); +#ifdef __linux__ MODULE_VERSION(DRV_VERSION); +#endif int mlx4_ib_sm_guid_assign = 1; #ifdef __linux__ struct proc_dir_entry *mlx4_mrs_dir_entry; static struct proc_dir_entry *mlx4_ib_driver_dir_entry; #endif module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); static char dev_assign_str[512]; //module_param_string(dev_assign_str, dev_assign_str, sizeof(dev_assign_str), 0644); MODULE_PARM_DESC(dev_assign_str, "Map all device function numbers to " "IB device numbers following the pattern: " "bb:dd.f-0,bb:dd.f-1,... (all numbers are hexadecimals)." " Max supported devices - 32"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; #define MAX_DR 32 static struct dev_rec dr[MAX_DR]; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && 
(dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if it is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ?
IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock(&iboe->lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; in_mad->attr_mod = cpu_to_be32(index 
/ 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; 
resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } #ifdef __linux__ static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* The last 8 bits hold the command; the rest is data for that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes.
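* (The search is restarted from TASK_UNMAPPED_BASE at most once before giving up with -ENOMEM.)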
*/ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); int err; /* The last 8 bits hold the command; the rest is data for that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatibility handling for commands 0 & 1 */ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* Getting contiguous physical pages */ unsigned long total_size = vma->vm_end - vma->vm_start; unsigned long page_size_order = (vma->vm_pgoff) >> MLX4_IB_MMAP_CMD_BITS; struct ib_cmem *ib_cmem; ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size, page_size_order); if (IS_ERR(ib_cmem)) { err = PTR_ERR(ib_cmem); return err; } err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma); if (err) { ib_cmem_release_contiguous_pages(ib_cmem); return err; } return 0; } else return -EINVAL; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge,
GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) return -ENOMEM; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), MLX4_PROT_IB_IPV6, &reg_id); if (err) goto err_malloc; err = add_gid_entry(ibqp, gid); if (err) goto err_add; if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); ib_steering->reg_id = reg_id; mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); } return 0; err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); err_malloc: kfree(ib_steering); return err; } enum { IBV_FLOW_L4_NONE = 0, IBV_FLOW_L4_OTHER = 3, IBV_FLOW_L4_UDP = 5, IBV_FLOW_L4_TCP = 6 }; struct mlx4_cm_steering { struct list_head list; u64 reg_id; struct ib_flow_spec spec; }; static int flow_spec_to_net_rule(struct ib_device *dev, struct ib_flow_spec *flow_spec, struct list_head *rule_list_h) { struct mlx4_spec_list *spec_l2, *spec_l3, *spec_l4; u64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL); if (!spec_l2) return -ENOMEM; switch (flow_spec->type) { case IB_FLOW_ETH: spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(spec_l2->eth.dst_mac, flow_spec->l2_id.eth.mac, ETH_ALEN); memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); spec_l2->eth.ether_type = flow_spec->l2_id.eth.ethertype; if (flow_spec->l2_id.eth.vlan_present) { spec_l2->eth.vlan_id = flow_spec->l2_id.eth.vlan; spec_l2->eth.vlan_id_msk = cpu_to_be16(0x0fff); } break; case IB_FLOW_IB_UC: spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; if (flow_spec->l2_id.ib_uc.qpn) { spec_l2->ib.l3_qpn = cpu_to_be32(flow_spec->l2_id.ib_uc.qpn); spec_l2->ib.qpn_msk = cpu_to_be32(0xffffff); } break; case IB_FLOW_IB_MC_IPV4: case IB_FLOW_IB_MC_IPV6: spec_l2->id = MLX4_NET_TRANS_RULE_ID_IB; memcpy(spec_l2->ib.dst_gid, flow_spec->l2_id.ib_mc.mgid, 16); memset(spec_l2->ib.dst_gid_msk, 0xff, 16); break; } list_add_tail(&spec_l2->list, rule_list_h); if (flow_spec->l2_id.eth.ethertype == cpu_to_be16(ETH_P_IP) || flow_spec->type != IB_FLOW_ETH) { spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL); if (!spec_l3) return -ENOMEM; spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; spec_l3->ipv4.src_ip = flow_spec->src_ip; if (flow_spec->type != IB_FLOW_IB_MC_IPV4 && flow_spec->type != IB_FLOW_IB_MC_IPV6) spec_l3->ipv4.dst_ip = flow_spec->dst_ip; if (spec_l3->ipv4.src_ip)
spec_l3->ipv4.src_ip_msk = MLX4_BE_WORD_MASK; if (spec_l3->ipv4.dst_ip) spec_l3->ipv4.dst_ip_msk = MLX4_BE_WORD_MASK; list_add_tail(&spec_l3->list, rule_list_h); } if (flow_spec->l4_protocol) { spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); if (!spec_l4) return -ENOMEM; spec_l4->tcp_udp.src_port = flow_spec->src_port; spec_l4->tcp_udp.dst_port = flow_spec->dst_port; if (spec_l4->tcp_udp.src_port) spec_l4->tcp_udp.src_port_msk = MLX4_BE_SHORT_MASK; if (spec_l4->tcp_udp.dst_port) spec_l4->tcp_udp.dst_port_msk = MLX4_BE_SHORT_MASK; switch (flow_spec->l4_protocol) { case IBV_FLOW_L4_UDP: spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; break; case IBV_FLOW_L4_TCP: spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; break; default: dev_err(dev->dma_device, "Unsupported l4 protocol.\n"); kfree(spec_l4); return -EPROTONOSUPPORT; } list_add_tail(&spec_l4->list, rule_list_h); } return 0; } static int __mlx4_ib_flow_attach(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, struct ib_flow_spec *flow_spec, int priority, int lock_qp) { u64 reg_id = 0; int err = 0; struct mlx4_cm_steering *cm_flow; struct mlx4_spec_list *spec, *tmp_spec; struct mlx4_net_trans_rule rule = { .queue_mode = MLX4_NET_TRANS_Q_FIFO, .exclusive = 0, }; rule.promisc_mode = flow_spec->rule_type; rule.port = mqp->port; rule.qpn = mqp->mqp.qpn; INIT_LIST_HEAD(&rule.list); cm_flow = kmalloc(sizeof(*cm_flow), GFP_KERNEL); if (!cm_flow) return -ENOMEM; if (rule.promisc_mode == MLX4_FS_REGULAR) { rule.allow_loopback = !flow_spec->block_mc_loopback; rule.priority = MLX4_DOMAIN_UVERBS | priority; err = flow_spec_to_net_rule(&mdev->ib_dev, flow_spec, &rule.list); if (err) goto free_list; } err = mlx4_flow_attach(mdev->dev, &rule, &reg_id); if (err) goto free_list; memcpy(&cm_flow->spec, flow_spec, sizeof(*flow_spec)); cm_flow->reg_id = reg_id; if (lock_qp) mutex_lock(&mqp->mutex); list_add(&cm_flow->list, &mqp->rules_list); if (lock_qp) mutex_unlock(&mqp->mutex); free_list: list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { list_del(&spec->list); kfree(spec); } if (err) { kfree(cm_flow); dev_err(mdev->ib_dev.dma_device, "Failed to attach flow steering rule\n"); } return err; } static int __mlx4_ib_flow_detach(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, struct ib_flow_spec *spec, int priority, int lock_qp) { struct mlx4_cm_steering *cm_flow; int ret; if (lock_qp) mutex_lock(&mqp->mutex); list_for_each_entry(cm_flow, &mqp->rules_list, list) { if (!memcmp(&cm_flow->spec, spec, sizeof(*spec))) { list_del(&cm_flow->list); break; } } if (lock_qp) mutex_unlock(&mqp->mutex); if (&cm_flow->list == &mqp->rules_list) { dev_err(mdev->ib_dev.dma_device, "Couldn't find reg_id for flow spec. 
" "Steering rule is left attached\n"); return -EINVAL; } ret = mlx4_flow_detach(mdev->dev, cm_flow->reg_id); kfree(cm_flow); return ret; } static int mlx4_ib_flow_attach(struct ib_qp *qp, struct ib_flow_spec *flow_spec, int priority) { return __mlx4_ib_flow_attach(to_mdev(qp->device), to_mqp(qp), flow_spec, priority, 1); } static int mlx4_ib_flow_detach(struct ib_qp *qp, struct ib_flow_spec *spec, int priority) { return __mlx4_ib_flow_detach(to_mdev(qp->device), to_mqp(qp), spec, priority, 1); } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u8 mac[6]; struct net_device *ndev; struct mlx4_ib_gid_entry *ge; u64 reg_id = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &mqp->steering_rules, list) { if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { list_del(&ib_steering->list); break; } } mutex_unlock(&mqp->mutex); if (&ib_steering->list == &mqp->steering_rules) { pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); return -EINVAL; } reg_id = ib_steering->reg_id; kfree(ib_steering); } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); if (err) return err; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? 
mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return 0; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) { #ifdef __linux__ memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); #else memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); #endif if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); return; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, 
MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); else { memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids); mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); kfree(gw); } static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) { struct net_device *ndev = dev->iboe.netdevs[port - 1]; struct update_gid_work *work; struct net_device *tmp; int i; u8 *hits; union ib_gid gid; int index_free; int found; int need_update = 0; int max_gids; u16 vid; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; hits = kzalloc(128, GFP_ATOMIC); if (!hits) { kfree(work); return -ENOMEM; } max_gids = dev->dev->caps.gid_table_len[port]; #ifdef __linux__ rcu_read_lock(); for_each_netdev_rcu(&init_net, tmp) { #else IFNET_RLOCK(); TAILQ_FOREACH(tmp, &V_ifnet, if_link) { #endif if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); vid = rdma_vlan_dev_vlan_id(tmp); mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); found = 0; index_free = -1; for (i = 0; i < max_gids; ++i) { if (index_free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) index_free = i; if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) { hits[i] = 1; found = 1; break; } } if (!found) { if (tmp == ndev && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) { dev->iboe.gid_table[port - 1][0] = gid; ++need_update; hits[0] = 1; } else if (index_free >= 0) { dev->iboe.gid_table[port - 1][index_free] = gid; hits[index_free] = 1; ++need_update; } } } #ifdef __linux__ } rcu_read_unlock(); #else } IFNET_RUNLOCK(); #endif for (i = 0; i < max_gids; ++i) if (!hits[i]) { if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid)) ++need_update; dev->iboe.gid_table[port - 1][i] = zgid; } if (need_update) { memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids); INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); } else kfree(work); kfree(hits); return 0; } static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event) { switch (event) { case NETDEV_UP: #ifdef __linux__ case NETDEV_CHANGEADDR: #endif update_ipv6_gids(dev, port, 0); break; case NETDEV_DOWN: update_ipv6_gids(dev, port, 1); dev->iboe.netdevs[port - 1] = NULL; } } static void netdev_added(struct mlx4_ib_dev *dev, int port) { update_ipv6_gids(dev, port, 0); } static void netdev_removed(struct mlx4_ib_dev *dev, int port) { update_ipv6_gids(dev, port, 1); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; struct net_device *oldnd; struct mlx4_ib_iboe *iboe; int port; #ifdef __linux__ if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; #endif ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); iboe = &ibdev->iboe; spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { oldnd = iboe->netdevs[port - 1]; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (oldnd != iboe->netdevs[port - 1]) { if (iboe->netdevs[port - 1]) netdev_added(ibdev, port); else netdev_removed(ibdev, port); } } if (dev == iboe->netdevs[0] || (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) 
handle_en_event(ibdev, 1, event); else if (dev == iboe->netdevs[1] || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) handle_en_event(ibdev, 2, event); spin_unlock(&iboe->lock); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the rest of the vector with legacy EQs */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf){ \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = 
offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; #ifdef __linux__ static int mlx4_ib_proc_init(void) { /* Creating procfs directories /proc/drivers/mlx4_ib/ && /proc/drivers/mlx4_ib/mrs for further use by the driver. 
*/ int err; mlx4_ib_driver_dir_entry = proc_mkdir(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); if (!mlx4_ib_driver_dir_entry) { pr_err("mlx4_ib_proc_init has failed for %s\n", MLX4_IB_DRIVER_PROC_DIR_NAME); err = -ENODEV; goto error; } mlx4_mrs_dir_entry = proc_mkdir(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); if (!mlx4_mrs_dir_entry) { pr_err("mlx4_ib_proc_init has failed for %s\n", MLX4_IB_MRS_PROC_DIR_NAME); err = -ENODEV; goto remove_entry; } return 0; remove_entry: remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); error: return err; } #endif static void init_dev_assign(void) { int bus, slot, fn, ib_idx; char *p = dev_assign_str, *t; char curr_val[32] = {0}; int ret; int j, i = 0; memset(dr, 0, sizeof dr); if (dev_assign_str[0] == 0) return; while (strlen(p)) { ret = sscanf(p, "%02x:%02x.%x-%x", &bus, &slot, &fn, &ib_idx); if (ret != 4 || ib_idx < 0) goto err; for (j = 0; j < i; j++) if (dr[j].nr == ib_idx) goto err; dr[i].bus = bus; dr[i].dev = slot; dr[i].func = fn; dr[i].nr = ib_idx; t = strchr(p, ','); sprintf(curr_val, "%02x:%02x.%x-%x", bus, slot, fn, ib_idx); if ((!t) && strlen(p) == strlen(curr_val)) return; if (!t || (t + 1) >= dev_assign_str + sizeof dev_assign_str) goto err; ++i; if (i >= MAX_DR) goto err; p = t + 1; } return; err: memset(dr, 0, sizeof dr); printk(KERN_WARNING "mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. The parameter value is discarded!"); } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; printk(KERN_INFO "%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << 
IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_FLOW) | (1ull << IB_USER_VERBS_CMD_DETACH_FLOW) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; #ifdef __linux__ ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.attach_flow = mlx4_ib_flow_attach; ibdev->ib_dev.detach_flow = mlx4_ib_flow_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { err = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i]); if (err) ibdev->counters[i] = -1; } else ibdev->counters[i] = -1; } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_slave(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err 
= mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) goto err_sriov; } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notif; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notif; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notif: if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); flush_workqueue(wq); err_sriov: mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) if (ibdev->counters[i - 1] != -1) mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1]); err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { struct ib_flow_spec spec = { .type = IB_FLOW_IB_UC, .l2_id.ib_uc.qpn = mqp->ibqp.qp_num, }; return is_attach ? 
__mlx4_ib_flow_attach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0) : __mlx4_ib_flow_detach(mdev, mqp, &spec, MLX4_DOMAIN_NIC, 0); } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p,j; mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } ib_unregister_device(&ibdev->ib_dev); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) if (ibdev->counters[p] != -1) mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p]); mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } 
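/*
 * The event work struct above is allocated with GFP_ATOMIC, presumably
 * because mlx4_ib_event() can be invoked from the mlx4 core's EQ
 * dispatch (interrupt) context, where sleeping allocations are not
 * allowed; the expensive processing is deferred to the workqueue just
 * below when this function is the port owner.
 */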
INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; #ifdef __linux__ err = mlx4_ib_proc_init(); if (err) goto clean_wq; #endif err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: #ifdef __linux__ remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); clean_wq: #endif destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); /* Remove proc entries */ #ifdef __linux__ remove_proc_entry(MLX4_IB_MRS_PROC_DIR_NAME, mlx4_ib_driver_dir_entry); remove_proc_entry(MLX4_IB_DRIVER_PROC_DIR_NAME, NULL); #endif } module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); -#undef MODULE_VERSION -#include static int mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxapi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (revision 277401) +++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c (revision 277402) @@ -1,1359 +1,1361 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include "mthca_dev.h" #include "mthca_config_reg.h" #include "mthca_cmd.h" #include "mthca_profile.h" #include "mthca_memfree.h" #include "mthca_wqe.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_VERSION(DRV_VERSION); +MODULE_VERSION(mthca, 1); +MODULE_DEPEND(mthca, linuxapi, 1, 1, 1); +MODULE_DEPEND(mthca, ibcore, 1, 1, 1); #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG int mthca_debug_level = 0; module_param_named(debug_level, mthca_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ #ifdef CONFIG_PCI_MSI static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #else /* CONFIG_PCI_MSI */ #define msi_x (0) #endif /* CONFIG_PCI_MSI */ static int tune_pci = 0; module_param(tune_pci, int, 0444); MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); DEFINE_MUTEX(mthca_device_mutex); #define MTHCA_DEFAULT_NUM_QP (1 << 16) #define MTHCA_DEFAULT_RDB_PER_QP (1 << 2) #define MTHCA_DEFAULT_NUM_CQ (1 << 16) #define MTHCA_DEFAULT_NUM_MCG (1 << 13) #define MTHCA_DEFAULT_NUM_MPT (1 << 17) #define MTHCA_DEFAULT_NUM_MTT (1 << 20) #define MTHCA_DEFAULT_NUM_UDAV (1 << 15) #define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18) #define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18) static struct mthca_profile hca_profile = { .num_qp = MTHCA_DEFAULT_NUM_QP, .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP, .num_cq = MTHCA_DEFAULT_NUM_CQ, .num_mcg = MTHCA_DEFAULT_NUM_MCG, .num_mpt = MTHCA_DEFAULT_NUM_MPT, .num_mtt = MTHCA_DEFAULT_NUM_MTT, .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */ .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */ .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */ }; module_param_named(num_qp, hca_profile.num_qp, int, 0444); MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA"); module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444); MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP"); module_param_named(num_cq, hca_profile.num_cq, int, 0444); MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA"); module_param_named(num_mcg, hca_profile.num_mcg, int, 0444); MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA"); module_param_named(num_mpt, hca_profile.num_mpt, int, 0444); MODULE_PARM_DESC(num_mpt, "maximum number of memory protection table entries per HCA"); module_param_named(num_mtt, hca_profile.num_mtt, int, 0444); MODULE_PARM_DESC(num_mtt, "maximum number of memory translation table segments per HCA"); module_param_named(num_udav, hca_profile.num_udav, int, 0444); MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA"); module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); MODULE_PARM_DESC(fmr_reserved_mtts, "number of memory translation table segments reserved for FMR"); static int log_mtts_per_seg; 
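/*
 * Each MTT entry is 8 bytes, so the segment size computed from this
 * parameter in mthca_dev_lim() below is (1 << log_mtts_per_seg) * 8
 * bytes; as a worked example, log_mtts_per_seg = 3 gives 8 entries
 * per segment, i.e. 64-byte segments.
 */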
module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); static char mthca_version[] __devinitdata = DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static int mthca_tune_pci(struct mthca_dev *mdev) { if (!tune_pci) return 0; /* First try to max out Read Byte Count */ if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) { mthca_err(mdev, "Couldn't set PCI-X max read count, " "aborting.\n"); return -ENODEV; } } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); if (pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) { if (pcie_set_readrq(mdev->pdev, 4096)) { mthca_err(mdev, "Couldn't write PCI Express read request, " "aborting.\n"); return -ENODEV; } } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) mthca_info(mdev, "No PCI Express capability, " "not setting Max Read Request Size.\n"); return 0; } static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) { int err; u8 status; mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8; err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } if (dev_lim->min_page_sz > PAGE_SIZE) { mthca_err(mdev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", dev_lim->min_page_sz, PAGE_SIZE); return -ENODEV; } if (dev_lim->num_ports > MTHCA_MAX_PORTS) { mthca_err(mdev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev_lim->num_ports, MTHCA_MAX_PORTS); return -ENODEV; } if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, aborting.\n", dev_lim->uar_size, (unsigned long long)pci_resource_len(mdev->pdev, 2)); return -ENODEV; } mdev->limits.num_ports = dev_lim->num_ports; mdev->limits.vl_cap = dev_lim->max_vl; mdev->limits.mtu_cap = dev_lim->max_mtu; mdev->limits.gid_table_len = dev_lim->max_gids; mdev->limits.pkey_table_len = dev_lim->max_pkeys; mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay; /* * Need to allow for worst case send WQE overhead and check * whether max_desc_sz imposes a lower limit than max_sg; UD * send has the biggest overhead. */ mdev->limits.max_sg = min_t(int, dev_lim->max_sg, (dev_lim->max_desc_sz - sizeof (struct mthca_next_seg) - (mthca_is_memfree(mdev) ? sizeof (struct mthca_arbel_ud_seg) : sizeof (struct mthca_tavor_ud_seg))) / sizeof (struct mthca_data_seg)); mdev->limits.max_wqes = dev_lim->max_qp_sz; mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp; mdev->limits.reserved_qps = dev_lim->reserved_qps; mdev->limits.max_srq_wqes = dev_lim->max_srq_sz; mdev->limits.reserved_srqs = dev_lim->reserved_srqs; mdev->limits.reserved_eecs = dev_lim->reserved_eecs; mdev->limits.max_desc_sz = dev_lim->max_desc_sz; mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an * empty CQ and a full CQ. 
*/ mdev->limits.max_cqes = dev_lim->max_cq_sz - 1; mdev->limits.reserved_cqs = dev_lim->reserved_cqs; mdev->limits.reserved_eqs = dev_lim->reserved_eqs; mdev->limits.reserved_mtts = dev_lim->reserved_mtts; mdev->limits.reserved_mrws = dev_lim->reserved_mrws; mdev->limits.reserved_uars = dev_lim->reserved_uars; mdev->limits.reserved_pds = dev_lim->reserved_pds; mdev->limits.port_width_cap = dev_lim->max_port_width; mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); mdev->limits.flags = dev_lim->flags; /* * For old FW that doesn't return static rate support, use a * value of 0x3 (only static rate values of 0 or 1 are handled), * except on Sinai, where even old FW can handle static rate * values of 2 and 3. */ if (dev_lim->stat_rate_support) mdev->limits.stat_rate_support = dev_lim->stat_rate_support; else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) mdev->limits.stat_rate_support = 0xf; else mdev->limits.stat_rate_support = 0x3; /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. May be doable since hardware supports it for SRQ. IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver. IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not supported by driver. */ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR) mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR) mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI) mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI; if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG) mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE) mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev_lim->flags & DEV_LIM_FLAG_SRQ) mdev->mthca_flags |= MTHCA_FLAG_SRQ; if (mthca_is_memfree(mdev)) if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM) mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; return 0; } static int mthca_init_tavor(struct mthca_dev *mdev) { s64 size; u8 status; int err; struct mthca_dev_lim dev_lim; struct mthca_profile profile; struct mthca_init_hca_param init_hca; err = mthca_SYS_EN(mdev, &status); if (err) { mthca_err(mdev, "SYS_EN command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "SYS_EN returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_QUERY_FW(mdev, &status); if (err) { mthca_err(mdev, "QUERY_FW command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "QUERY_FW returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } err = mthca_QUERY_DDR(mdev, &status); if (err) { mthca_err(mdev, "QUERY_DDR command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "QUERY_DDR returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } err = mthca_dev_lim(mdev, &dev_lim); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); goto err_disable; } profile = hca_profile; profile.num_uar = dev_lim.uar_size / PAGE_SIZE; profile.uarc_size = 0; if (mdev->mthca_flags & MTHCA_FLAG_SRQ) profile.num_srq = dev_lim.max_srqs; size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); if (size < 0) { err = size; goto err_disable; } err = mthca_INIT_HCA(mdev, &init_hca, &status); if (err) { mthca_err(mdev, "INIT_HCA command failed, aborting.\n"); goto err_disable; } if (status) { mthca_err(mdev, "INIT_HCA 
returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_disable; } return 0; err_disable: mthca_SYS_DIS(mdev, &status); return err; } static int mthca_load_fw(struct mthca_dev *mdev) { u8 status; int err; /* FIXME: use HCA-attached memory for FW if present */ mdev->fw.arbel.fw_icm = mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.fw_icm) { mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; } err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status); if (err) { mthca_err(mdev, "MAP_FA command failed, aborting.\n"); goto err_free; } if (status) { mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_free; } err = mthca_RUN_FW(mdev, &status); if (err) { mthca_err(mdev, "RUN_FW command failed, aborting.\n"); goto err_unmap_fa; } if (status) { mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_unmap_fa; } return 0; err_unmap_fa: mthca_UNMAP_FA(mdev, &status); err_free: mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); return err; } static int mthca_init_icm(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim, struct mthca_init_hca_param *init_hca, u64 icm_size) { u64 aux_pages; u8 status; int err; err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status); if (err) { mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n", (unsigned long long) icm_size >> 10, (unsigned long long) aux_pages << 2); mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.aux_icm) { mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; } err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status); if (err) { mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n"); goto err_free_aux; } if (status) { mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status); err = -EINVAL; goto err_free_aux; } err = mthca_map_eq_icm(mdev, init_hca->eqc_base); if (err) { mthca_err(mdev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_aux; } /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size, dma_get_cache_alignment()) / mdev->limits.mtt_seg_size; mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, mdev->limits.mtt_seg_size, mdev->limits.num_mtt_segs, mdev->limits.reserved_mtts, 1, 0); if (!mdev->mr_table.mtt_table) { mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_eq; } mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, dev_lim->mpt_entry_sz, mdev->limits.num_mpts, mdev->limits.reserved_mrws, 1, 1); if (!mdev->mr_table.mpt_table) { mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_mtt; } mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, dev_lim->qpc_entry_sz, mdev->limits.num_qps, mdev->limits.reserved_qps, 0, 0); if (!mdev->qp_table.qp_table) { mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_mpt; } mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, dev_lim->eqpc_entry_sz, mdev->limits.num_qps, 
mdev->limits.reserved_qps, 0, 0); if (!mdev->qp_table.eqp_table) { mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_qp; } mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, MTHCA_RDB_ENTRY_SIZE, mdev->limits.num_qps << mdev->qp_table.rdb_shift, 0, 0, 0); if (!mdev->qp_table.rdb_table) { mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); err = -ENOMEM; goto err_unmap_eqp; } mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, dev_lim->cqc_entry_sz, mdev->limits.num_cqs, mdev->limits.reserved_cqs, 0, 0); if (!mdev->cq_table.table) { mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_rdb; } if (mdev->mthca_flags & MTHCA_FLAG_SRQ) { mdev->srq_table.table = mthca_alloc_icm_table(mdev, init_hca->srqc_base, dev_lim->srq_entry_sz, mdev->limits.num_srqs, mdev->limits.reserved_srqs, 0, 0); if (!mdev->srq_table.table) { mthca_err(mdev, "Failed to map SRQ context memory, " "aborting.\n"); err = -ENOMEM; goto err_unmap_cq; } } /* * It's not strictly required, but for simplicity just map the * whole multicast group table now. The table isn't very big * and it's a lot easier than trying to track ref counts. */ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base, MTHCA_MGM_ENTRY_SIZE, mdev->limits.num_mgms + mdev->limits.num_amgms, mdev->limits.num_mgms + mdev->limits.num_amgms, 0, 0); if (!mdev->mcg_table.table) { mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); err = -ENOMEM; goto err_unmap_srq; } return 0; err_unmap_srq: if (mdev->mthca_flags & MTHCA_FLAG_SRQ) mthca_free_icm_table(mdev, mdev->srq_table.table); err_unmap_cq: mthca_free_icm_table(mdev, mdev->cq_table.table); err_unmap_rdb: mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); err_unmap_eqp: mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); err_unmap_qp: mthca_free_icm_table(mdev, mdev->qp_table.qp_table); err_unmap_mpt: mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); err_unmap_mtt: mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); err_unmap_eq: mthca_unmap_eq_icm(mdev); err_unmap_aux: mthca_UNMAP_ICM_AUX(mdev, &status); err_free_aux: mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); return err; } static void mthca_free_icms(struct mthca_dev *mdev) { u8 status; mthca_free_icm_table(mdev, mdev->mcg_table.table); if (mdev->mthca_flags & MTHCA_FLAG_SRQ) mthca_free_icm_table(mdev, mdev->srq_table.table); mthca_free_icm_table(mdev, mdev->cq_table.table); mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); mthca_free_icm_table(mdev, mdev->qp_table.qp_table); mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); mthca_unmap_eq_icm(mdev); mthca_UNMAP_ICM_AUX(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); } static int mthca_init_arbel(struct mthca_dev *mdev) { struct mthca_dev_lim dev_lim; struct mthca_profile profile; struct mthca_init_hca_param init_hca; s64 icm_size; u8 status; int err; err = mthca_QUERY_FW(mdev, &status); if (err) { mthca_err(mdev, "QUERY_FW command failed, aborting.\n"); return err; } if (status) { mthca_err(mdev, "QUERY_FW returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_ENABLE_LAM(mdev, &status); if (err) { mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n"); return err; } if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) { mthca_dbg(mdev, "No HCA-attached memory (running in 
MemFree mode)\n"); mdev->mthca_flags |= MTHCA_FLAG_NO_LAM; } else if (status) { mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, " "aborting.\n", status); return -EINVAL; } err = mthca_load_fw(mdev); if (err) { mthca_err(mdev, "Failed to start FW, aborting.\n"); goto err_disable; } err = mthca_dev_lim(mdev, &dev_lim); if (err) { mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n"); goto err_stop_fw; } profile = hca_profile; profile.num_uar = dev_lim.uar_size / PAGE_SIZE; profile.num_udav = 0; if (mdev->mthca_flags & MTHCA_FLAG_SRQ) profile.num_srq = dev_lim.max_srqs; icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); if (icm_size < 0) { err = icm_size; goto err_stop_fw; } err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size); if (err) goto err_stop_fw; err = mthca_INIT_HCA(mdev, &init_hca, &status); if (err) { mthca_err(mdev, "INIT_HCA command failed, aborting.\n"); goto err_free_icm; } if (status) { mthca_err(mdev, "INIT_HCA returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_free_icm; } return 0; err_free_icm: mthca_free_icms(mdev); err_stop_fw: mthca_UNMAP_FA(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); err_disable: if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); return err; } static void mthca_close_hca(struct mthca_dev *mdev) { u8 status; mthca_CLOSE_HCA(mdev, 0, &status); if (mthca_is_memfree(mdev)) { mthca_free_icms(mdev); mthca_UNMAP_FA(mdev, &status); mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); } else mthca_SYS_DIS(mdev, &status); } static int mthca_init_hca(struct mthca_dev *mdev) { u8 status; int err; struct mthca_adapter adapter; if (mthca_is_memfree(mdev)) err = mthca_init_arbel(mdev); else err = mthca_init_tavor(mdev); if (err) return err; err = mthca_QUERY_ADAPTER(mdev, &adapter, &status); if (err) { mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n"); goto err_close; } if (status) { mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, " "aborting.\n", status); err = -EINVAL; goto err_close; } mdev->eq_table.inta_pin = adapter.inta_pin; if (!mthca_is_memfree(mdev)) mdev->rev_id = adapter.revision_id; memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); return 0; err_close: mthca_close_hca(mdev); return err; } static int mthca_setup_hca(struct mthca_dev *dev) { int err; u8 status; MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock); err = mthca_init_uar_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "user access region table, aborting.\n"); return err; } err = mthca_uar_alloc(dev, &dev->driver_uar); if (err) { mthca_err(dev, "Failed to allocate driver access region, " "aborting.\n"); goto err_uar_table_free; } dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!dev->kar) { mthca_err(dev, "Couldn't map kernel access region, " "aborting.\n"); err = -ENOMEM; goto err_uar_free; } err = mthca_init_pd_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "protection domain table, aborting.\n"); goto err_kar_unmap; } err = mthca_init_mr_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "memory region table, aborting.\n"); goto err_pd_table_free; } err = mthca_pd_alloc(dev, 1, &dev->driver_pd); if (err) { mthca_err(dev, "Failed to create driver PD, " "aborting.\n"); goto err_mr_table_free; } err = mthca_init_eq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "event queue table, aborting.\n"); goto err_pd_free; } 
err = mthca_cmd_use_events(dev); if (err) { mthca_err(dev, "Failed to switch to event-driven " "firmware commands, aborting.\n"); goto err_eq_table_free; } err = mthca_NOP(dev, &status); if (err || status) { if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { mthca_warn(dev, "NOP command failed to generate interrupt " "(IRQ %d).\n", dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector); mthca_warn(dev, "Trying again with MSI-X disabled.\n"); } else { mthca_err(dev, "NOP command failed to generate interrupt " "(IRQ %d), aborting.\n", dev->pdev->irq); mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } goto err_cmd_poll; } mthca_dbg(dev, "NOP command IRQ test passed\n"); err = mthca_init_cq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "completion queue table, aborting.\n"); goto err_cmd_poll; } err = mthca_init_srq_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "shared receive queue table, aborting.\n"); goto err_cq_table_free; } err = mthca_init_qp_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "queue pair table, aborting.\n"); goto err_srq_table_free; } err = mthca_init_av_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "address vector table, aborting.\n"); goto err_qp_table_free; } err = mthca_init_mcg_table(dev); if (err) { mthca_err(dev, "Failed to initialize " "multicast group table, aborting.\n"); goto err_av_table_free; } return 0; err_av_table_free: mthca_cleanup_av_table(dev); err_qp_table_free: mthca_cleanup_qp_table(dev); err_srq_table_free: mthca_cleanup_srq_table(dev); err_cq_table_free: mthca_cleanup_cq_table(dev); err_cmd_poll: mthca_cmd_use_polling(dev); err_eq_table_free: mthca_cleanup_eq_table(dev); err_pd_free: mthca_pd_free(dev, &dev->driver_pd); err_mr_table_free: mthca_cleanup_mr_table(dev); err_pd_table_free: mthca_cleanup_pd_table(dev); err_kar_unmap: iounmap(dev->kar); err_uar_free: mthca_uar_free(dev, &dev->driver_uar); err_uar_table_free: mthca_cleanup_uar_table(dev); return err; } static int mthca_enable_msi_x(struct mthca_dev *mdev) { struct msix_entry entries[3]; int err; entries[0].entry = 0; entries[1].entry = 1; entries[2].entry = 2; err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries)); if (err) { if (err > 0) mthca_info(mdev, "Only %d MSI-X vectors available, " "not using MSI-X\n", err); return err; } mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector; mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector; mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector; return 0; } /* Types of supported HCA */ enum { TAVOR, /* MT23108 */ ARBEL_COMPAT, /* MT25208 in Tavor compat mode */ ARBEL_NATIVE, /* MT25208 with extended features */ SINAI /* MT25204 */ }; #define MTHCA_FW_VER(major, minor, subminor) \ (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor)) static struct { u64 latest_fw; u32 flags; } mthca_hca_table[] = { [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), .flags = 0 }, [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), .flags = MTHCA_FLAG_PCIE }, [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE }, [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), .flags = MTHCA_FLAG_MEMFREE | MTHCA_FLAG_PCIE | MTHCA_FLAG_SINAI_OPT } }; static int __mthca_init_one(struct pci_dev *pdev, int hca_type) { int ddr_hidden = 0; int err; struct mthca_dev *mdev; printk(KERN_INFO PFX "Initializing %s\n", pci_name(pdev)); err = pci_enable_device(pdev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, 
" "aborting.\n"); return err; } /* * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not * be present) */ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || pci_resource_len(pdev, 0) != 1 << 20) { dev_err(&pdev->dev, "Missing DCS, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing UAR, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) ddr_hidden = 1; err = pci_request_regions(pdev, DRV_NAME); if (err) { dev_err(&pdev->dev, "Cannot obtain PCI resources, " "aborting.\n"); goto err_disable_pdev; } pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_free_res; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); goto err_free_res; } } mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); if (!mdev) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_free_res; } mdev->pdev = pdev; mdev->mthca_flags = mthca_hca_table[hca_type].flags; if (ddr_hidden) mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left * the HCA in an undefined state. */ err = mthca_reset(mdev); if (err) { mthca_err(mdev, "Failed to reset HCA, aborting.\n"); goto err_free_dev; } if (mthca_cmd_init(mdev)) { mthca_err(mdev, "Failed to init command interface, aborting.\n"); goto err_free_dev; } err = mthca_tune_pci(mdev); if (err) goto err_cmd; err = mthca_init_hca(mdev); if (err) goto err_cmd; if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n", (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, (int) (mdev->fw_ver & 0xffff), (int) (mthca_hca_table[hca_type].latest_fw >> 32), (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); } if (msi_x && !mthca_enable_msi_x(mdev)) mdev->mthca_flags |= MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) pci_disable_msix(pdev); mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); } if (err) goto err_close; err = mthca_register_device(mdev); if (err) goto err_cleanup; err = mthca_create_agents(mdev); if (err) goto err_unregister; pci_set_drvdata(pdev, mdev); mdev->hca_type = hca_type; mdev->active = 1; return 0; err_unregister: mthca_unregister_device(mdev); err_cleanup: mthca_cleanup_mcg_table(mdev); mthca_cleanup_av_table(mdev); mthca_cleanup_qp_table(mdev); mthca_cleanup_srq_table(mdev); mthca_cleanup_cq_table(mdev); mthca_cmd_use_polling(mdev); mthca_cleanup_eq_table(mdev); mthca_pd_free(mdev, &mdev->driver_pd); mthca_cleanup_mr_table(mdev); mthca_cleanup_pd_table(mdev); mthca_cleanup_uar_table(mdev); err_close: if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) 
pci_disable_msix(pdev); mthca_close_hca(mdev); err_cmd: mthca_cmd_cleanup(mdev); err_free_dev: ib_dealloc_device(&mdev->ib_dev); err_free_res: pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); return err; } static void __mthca_remove_one(struct pci_dev *pdev) { struct mthca_dev *mdev = pci_get_drvdata(pdev); u8 status; int p; if (mdev) { mthca_free_agents(mdev); mthca_unregister_device(mdev); for (p = 1; p <= mdev->limits.num_ports; ++p) mthca_CLOSE_IB(mdev, p, &status); mthca_cleanup_mcg_table(mdev); mthca_cleanup_av_table(mdev); mthca_cleanup_qp_table(mdev); mthca_cleanup_srq_table(mdev); mthca_cleanup_cq_table(mdev); mthca_cmd_use_polling(mdev); mthca_cleanup_eq_table(mdev); mthca_pd_free(mdev, &mdev->driver_pd); mthca_cleanup_mr_table(mdev); mthca_cleanup_pd_table(mdev); iounmap(mdev->kar); mthca_uar_free(mdev, &mdev->driver_uar); mthca_cleanup_uar_table(mdev); mthca_close_hca(mdev); mthca_cmd_cleanup(mdev); if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) pci_disable_msix(pdev); ib_dealloc_device(&mdev->ib_dev); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } } int __mthca_restart_one(struct pci_dev *pdev) { struct mthca_dev *mdev; int hca_type; mdev = pci_get_drvdata(pdev); if (!mdev) return -ENODEV; hca_type = mdev->hca_type; __mthca_remove_one(pdev); return __mthca_init_one(pdev, hca_type); } static int __devinit mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { static int mthca_version_printed = 0; int ret; mutex_lock(&mthca_device_mutex); if (!mthca_version_printed) { printk(KERN_INFO "%s", mthca_version); ++mthca_version_printed; } if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { printk(KERN_ERR PFX "%s has invalid driver data %jx\n", pci_name(pdev), (uintmax_t)id->driver_data); mutex_unlock(&mthca_device_mutex); return -ENODEV; } ret = __mthca_init_one(pdev, id->driver_data); mutex_unlock(&mthca_device_mutex); return ret; } static void __devexit mthca_remove_one(struct pci_dev *pdev) { mutex_lock(&mthca_device_mutex); __mthca_remove_one(pdev); mutex_unlock(&mthca_device_mutex); } static struct pci_device_id mthca_pci_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), .driver_data = TAVOR }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), .driver_data = ARBEL_COMPAT }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), .driver_data = ARBEL_COMPAT }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL), .driver_data = ARBEL_NATIVE }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL), .driver_data = ARBEL_NATIVE }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), .driver_data = SINAI }, { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), .driver_data = SINAI }, { 0, } }; MODULE_DEVICE_TABLE(pci, mthca_pci_table); static struct pci_driver mthca_driver = { .name = DRV_NAME, .id_table = mthca_pci_table, .probe = mthca_init_one, .remove = __devexit_p(mthca_remove_one) }; static void __init __mthca_check_profile_val(const char *name, int *pval, int pval_default) { /* value must be positive and power of 2 */ int old_pval = *pval; if (old_pval <= 0) *pval = 
pval_default; else *pval = roundup_pow_of_two(old_pval); if (old_pval != *pval) { printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n", old_pval, name); printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval); } } #define mthca_check_profile_val(name, default) \ __mthca_check_profile_val(#name, &hca_profile.name, default) static void __init mthca_validate_profile(void) { mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP); mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP); mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ); mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG); mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT); mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT); mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV); mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS); if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) { printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n", hca_profile.fmr_reserved_mtts); printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n", hca_profile.num_mtt); hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2; printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", hca_profile.fmr_reserved_mtts); } if (log_mtts_per_seg == 0) log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n", log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); } } static int __init mthca_init(void) { int ret; mthca_validate_profile(); ret = mthca_catas_init(); if (ret) return ret; ret = pci_register_driver(&mthca_driver); if (ret < 0) { mthca_catas_cleanup(); return ret; } return 0; } static void __exit mthca_cleanup(void) { pci_unregister_driver(&mthca_driver); mthca_catas_cleanup(); } module_init_order(mthca_init, SI_ORDER_MIDDLE); module_exit(mthca_cleanup); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 277401) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 277402) @@ -1,1545 +1,1543 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) /* * This is for clients that have an ipoib_header in the mbuf. 
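* The IPoIB header is stripped temporarily and a synthetic Ethernet * header is handed to BPF in its place; the mbuf is restored before * returning, so callers see it unchanged.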
*/ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { struct ifnet *dev = priv->dev; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->if_mtu = new_mtu; return 0; } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; 
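/* Multicast filter changes are applied asynchronously: the restart task rejoins the updated group set from the workqueue. */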
case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, INFINIBAND_ALEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, 
&remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { _IF_ENQUEUE(&path->queue, mb); if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: 
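/* Unwind in reverse order of allocation. */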
kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { bpfdetach(dev); if_detach(dev); if_free(dev); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static volatile int ipoib_unit; static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return NULL; } dev->if_softc = priv; if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1)); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; if_attach(dev); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = dev->if_addrlen; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; } int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { struct ib_device_attr *device_attr; int result = -ENOMEM; device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); if (!device_attr) { printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", hca->name, sizeof *device_attr); return result; } result = ib_query_device(hca, device_attr); if (result) { printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", hca->name, result); kfree(device_attr); return result; } priv->hca_caps = device_attr->device_cap_flags; kfree(device_attr); priv->dev->if_hwassist = 0; priv->dev->if_capabilities = 0; #ifndef CONFIG_INFINIBAND_IPOIB_CM if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { set_bit(IPOIB_FLAG_CSUM, &priv->flags); priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP; priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } #if 0 if (priv->dev->features & NETIF_F_SG && 
priv->hca_caps & IB_DEVICE_UD_TSO) { priv->dev->if_capabilities |= IFCAP_TSO4; priv->dev->if_hwassist |= CSUM_TSO; } #endif #endif priv->dev->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE; priv->dev->if_capenable = priv->dev->if_capabilities; return 0; } static struct ifnet * ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; struct ib_port_attr attr; int result = -ENOMEM; priv = ipoib_intf_alloc(format); if (!priv) goto alloc_mem_failed; if (!ib_query_port(hca, port, &attr)) priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); else { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } /* MTU will be reset when mcast join happens */ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu; result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_set_dev_features(priv, hca)) goto device_init_failed; /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; priv->broadcastaddr[8] = priv->pkey >> 8; priv->broadcastaddr[9] = priv->pkey & 0xff; result = ib_query_gid(hca, port, 0, &priv->local_gid); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; } memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid)); result = ipoib_dev_init(priv, hca, port); if (result < 0) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; } if (ipoib_cm_admin_enabled(priv)) priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); result = ib_register_event_handler(&priv->event_handler); if (result < 0) { printk(KERN_WARNING "%s: ib_register_event_handler failed for " "port %d (ret = %d)\n", hca->name, port, result); goto event_failed; } if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port); return priv->dev; event_failed: ipoib_dev_cleanup(priv); device_init_failed: ipoib_detach(priv); alloc_mem_failed: return ERR_PTR(result); } static void ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct ifnet *dev; struct ipoib_dev_priv *priv; int s, e, p; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); if (!dev_list) return; INIT_LIST_HEAD(dev_list); if (device->node_type == RDMA_NODE_IB_SWITCH) { s = 0; e = 0; } else { s = 1; e = device->phys_port_cnt; } for (p = s; p <= e; ++p) { if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib", device, p); if (!IS_ERR(dev)) { priv = dev->if_softc; list_add_tail(&priv->list, dev_list); } } ib_set_client_data(device, &ipoib_client, dev_list); } static void ipoib_remove_one(struct ib_device *device) { struct ipoib_dev_priv *priv, *tmp; struct list_head *dev_list; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; dev_list = ib_get_client_data(device, &ipoib_client); list_for_each_entry_safe(priv, tmp, dev_list, list) { if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) continue; ipoib_stop(priv); ib_unregister_event_handler(&priv->event_handler); /* dev_change_flags(priv->dev, 
priv->dev->flags & ~IFF_UP); */ flush_workqueue(ipoib_workqueue); ipoib_dev_cleanup(priv); ipoib_detach(priv); } kfree(dev_list); } static void ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; int error; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev == NULL) return; priv = NULL; error = 0; parent = ifp->if_softc; /* We only support 15 bits of pkey. */ if (vtag & 0x8000) return; pkey = vtag | 0x8000; /* Set full membership bit. */ if (pkey == parent->pkey) return; /* Check for dups */ mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { priv = NULL; error = EBUSY; goto out; } } priv = ipoib_priv_alloc(); priv->dev = dev; priv->max_ib_mtu = parent->max_ib_mtu; priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); error = ipoib_set_dev_features(priv, parent->ca); if (error) goto out; priv->pkey = pkey; priv->broadcastaddr[8] = pkey >> 8; priv->broadcastaddr[9] = pkey & 0xff; dev->if_broadcastaddr = priv->broadcastaddr; error = ipoib_dev_init(priv, parent->ca, parent->port); if (error) goto out; priv->parent = parent->dev; list_add_tail(&priv->list, &parent->child_intfs); VLAN_SETCOOKIE(dev, priv); dev->if_start = ipoib_vlan_start; dev->if_drv_flags &= ~IFF_DRV_RUNNING; dev->if_hdrlen = IPOIB_HEADER_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_open(priv); mutex_unlock(&parent->vlan_mutex); return; out: mutex_unlock(&parent->vlan_mutex); if (priv) free(priv, M_TEMP); if (error) ipoib_warn(parent, "failed to initialize subinterface: device %s, port %d vtag 0x%X", parent->ca->name, parent->port, vtag); return; } static void ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct ipoib_dev_priv *parent; struct ipoib_dev_priv *priv; struct ifnet *dev; uint16_t pkey; if (ifp->if_type != IFT_INFINIBAND) return; dev = VLAN_DEVAT(ifp, vtag); if (dev) VLAN_SETCOOKIE(dev, NULL); pkey = vtag | 0x8000; parent = ifp->if_softc; mutex_lock(&parent->vlan_mutex); list_for_each_entry(priv, &parent->child_intfs, list) { if (priv->pkey == pkey) { ipoib_dev_cleanup(priv); list_del(&priv->list); break; } } mutex_unlock(&parent->vlan_mutex); } eventhandler_tag ipoib_vlan_attach; eventhandler_tag ipoib_vlan_detach; static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST); ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST); /* * We create our own workqueue mainly because we want to be * able to flush it when devices are being removed. We can't * use schedule_work()/flush_scheduled_work() because both * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. 
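* A private single-threaded workqueue also gives us a flush boundary * that does not depend on any locks held by the shared queue.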
*/ ipoib_workqueue = create_singlethread_workqueue("ipoib"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; return 0; err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: return ret; } static void __exit ipoib_cleanup_module(void) { EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } /* * Infiniband output routine. */ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; struct llentry *lle = NULL; struct rtentry *rt0 = NULL; struct ipoib_header *eh; int error = 0, is_gw = 0; short type; if (ro != NULL) { if (!(m->m_flags & (M_BCAST | M_MCAST))) lle = ro->ro_lle; rt0 = ro->ro_rt; if (rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) != 0) is_gw = 1; } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. 
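* A monitoring interface never forwards traffic, so the frame is * dropped here once it has been counted.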
*/ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = 0; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); -#undef MODULE_VERSION -#include static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_SMP, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxapi, 1, 1, 1); Index: head/sys/ofed/drivers/net/mlx4/en_main.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/en_main.c (revision 277401) +++ head/sys/ofed/drivers/net/mlx4/en_main.c (revision 277402) @@ -1,352 +1,352 @@ /* * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include #include #include #include #include #include #include #include "mlx4_en.h" MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin"); MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver"); MODULE_LICENSE("Dual BSD/GPL"); +#ifdef __linux__ MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")"); +#endif static const char mlx4_en_version[] = DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; #define MLX4_EN_PARM_INT(X, def_val, desc) \ static unsigned int X = def_val;\ module_param(X , uint, 0444); \ MODULE_PARM_DESC(X, desc); /* * Device scope module parameters */ /* Enable RSS UDP traffic */ MLX4_EN_PARM_INT(udp_rss, 1, "Enable RSS for incoming UDP traffic"); /* Priority pausing */ MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." " Per priority bit mask"); MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." 
" Per priority bit mask"); #define MAX_PFC_TX 0xff #define MAX_PFC_RX 0xff static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) { struct mlx4_en_profile *params = &mdev->profile; int i; params->udp_rss = udp_rss; params->num_tx_rings_p_up = min_t(int, mp_ncpus, MLX4_EN_MAX_TX_RING_P_UP); if (params->udp_rss && !(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { mlx4_warn(mdev, "UDP RSS is not supported on this device.\n"); params->udp_rss = 0; } for (i = 1; i <= MLX4_MAX_PORTS; i++) { params->prof[i].rx_pause = 1; params->prof[i].rx_ppp = pfcrx; params->prof[i].tx_pause = 1; params->prof[i].tx_ppp = pfctx; params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; params->prof[i].tx_ring_num = params->num_tx_rings_p_up * MLX4_EN_NUM_UP; params->prof[i].rss_rings = 0; } return 0; } static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) { struct mlx4_en_dev *endev = ctx; return endev->pndev[port]; } static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, enum mlx4_dev_event event, unsigned long port) { struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; struct mlx4_en_priv *priv; switch (event) { case MLX4_DEV_EVENT_PORT_UP: case MLX4_DEV_EVENT_PORT_DOWN: if (!mdev->pndev[port]) return; priv = netdev_priv(mdev->pndev[port]); /* To prevent races, we poll the link state in a separate task rather than changing it here */ priv->link_state = event; queue_work(mdev->workqueue, &priv->linkstate_task); break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: mlx4_err(mdev, "Internal error detected, restarting device\n"); break; case MLX4_DEV_EVENT_SLAVE_INIT: case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: break; default: if (port < 1 || port > dev->caps.num_ports || !mdev->pndev[port]) return; mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, (int) port); } } static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) { struct mlx4_en_dev *mdev = endev_ptr; int i, ret; mutex_lock(&mdev->state_lock); mdev->device_up = false; mutex_unlock(&mdev->state_lock); mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) if (mdev->pndev[i]) mlx4_en_destroy_netdev(mdev->pndev[i]); flush_workqueue(mdev->workqueue); destroy_workqueue(mdev->workqueue); ret = mlx4_mr_free(dev, &mdev->mr); if (ret) mlx4_err(mdev, "Error deregistering MR. 
The system may have become unstable."); iounmap(mdev->uar_map); mlx4_uar_free(dev, &mdev->priv_uar); mlx4_pd_free(dev, mdev->priv_pdn); kfree(mdev); } static void *mlx4_en_add(struct mlx4_dev *dev) { struct mlx4_en_dev *mdev; int i; int err; printk_once(KERN_INFO "%s", mlx4_en_version); mdev = kzalloc(sizeof *mdev, GFP_KERNEL); if (!mdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_free_res; } if (mlx4_pd_alloc(dev, &mdev->priv_pdn)) goto err_free_dev; if (mlx4_uar_alloc(dev, &mdev->priv_uar)) goto err_pd; mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!mdev->uar_map) goto err_uar; spin_lock_init(&mdev->uar_lock); mdev->dev = dev; mdev->dma_device = &(dev->pdev->dev); mdev->pdev = dev->pdev; mdev->device_up = false; mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); if (!mdev->LSO_support) mlx4_warn(mdev, "LSO not supported, please upgrade to later " "FW version to enable LSO\n"); if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull, MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ, 0, 0, &mdev->mr)) { mlx4_err(mdev, "Failed allocating memory region\n"); goto err_map; } if (mlx4_mr_enable(mdev->dev, &mdev->mr)) { mlx4_err(mdev, "Failed enabling memory region\n"); goto err_mr; } /* Build device profile according to supplied module parameters */ err = mlx4_en_get_profile(mdev); if (err) { mlx4_err(mdev, "Bad module parameters, aborting.\n"); goto err_mr; } /* Configure which ports to start according to module parameters */ mdev->port_cnt = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) mdev->port_cnt++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { if (!dev->caps.comp_pool) { mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(max_t(int, MIN_RX_RINGS, min_t(int, dev->caps.num_comp_vectors, DEF_RX_RINGS))); } else { mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two( min_t(int, dev->caps.comp_pool/ dev->caps.num_ports - 1 , MAX_MSIX_P_PORT - 1)); } } /* Create our own workqueue for reset/multicast tasks * Note: we cannot use the shared workqueue because of deadlocks caused * by the rtnl lock */ mdev->workqueue = create_singlethread_workqueue("mlx4_en"); if (!mdev->workqueue) { err = -ENOMEM; goto err_mr; } /* At this stage all non-port specific tasks are complete: * mark the card state as up */ mutex_init(&mdev->state_lock); mdev->device_up = true; /* Setup ports */ /* Create a netdev for each port */ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { mlx4_info(mdev, "Activating port:%d\n", i); if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) mdev->pndev[i] = NULL; } return mdev; err_mr: err = mlx4_mr_free(dev, &mdev->mr); if (err) mlx4_err(mdev, "Error deregistering MR. 
The system may have become unstable."); err_map: if (mdev->uar_map) iounmap(mdev->uar_map); err_uar: mlx4_uar_free(dev, &mdev->priv_uar); err_pd: mlx4_pd_free(dev, mdev->priv_pdn); err_free_dev: kfree(mdev); err_free_res: return NULL; } static struct mlx4_interface mlx4_en_interface = { .add = mlx4_en_add, .remove = mlx4_en_remove, .event = mlx4_en_event, .get_dev = mlx4_en_get_netdev, .protocol = MLX4_PROT_ETH, }; static void mlx4_en_verify_params(void) { if (pfctx > MAX_PFC_TX) { pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - " "should be in range 0-0x%x, will be changed to default (0)\n", pfctx, MAX_PFC_TX); pfctx = 0; } if (pfcrx > MAX_PFC_RX) { pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - " "should be in range 0-0x%x, will be changed to default (0)\n", pfcrx, MAX_PFC_RX); pfcrx = 0; } } static int __init mlx4_en_init(void) { #ifdef CONFIG_DEBUG_FS int err; #endif mlx4_en_verify_params(); #ifdef CONFIG_DEBUG_FS err = mlx4_en_register_debugfs(); if (err) pr_err("Failed to register debugfs\n"); #endif return mlx4_register_interface(&mlx4_en_interface); } static void __exit mlx4_en_cleanup(void) { mlx4_unregister_interface(&mlx4_en_interface); #ifdef CONFIG_DEBUG_FS mlx4_en_unregister_debugfs(); #endif } module_init(mlx4_en_init); module_exit(mlx4_en_cleanup); -#undef MODULE_VERSION -#include static int mlxen_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlxen_mod = { .name = "mlxen", .evhand = mlxen_evhand, }; DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlxen, mlx4, 1, 1, 1); MODULE_DEPEND(mlxen, linuxapi, 1, 1, 1); Index: head/sys/ofed/drivers/net/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/net/mlx4/main.c (revision 277401) +++ head/sys/ofed/drivers/net/mlx4/main.c (revision 277402) @@ -1,3800 +1,3795 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007, 2008, 2014 Mellanox Technologies. All rights reserved. * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include -/* - * kmod.h must be included before module.h since it includes (indirectly) sys/module.h - * To use the FBSD macro sys/module.h should define MODULE_VERSION before linux/module does. -*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4.h" #include "fw.h" #include "icm.h" #include "mlx4_stats.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *mlx4_wq; #ifdef CONFIG_MLX4_DEBUG int mlx4_debug_level = 0; module_param_named(debug_level, mlx4_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_MLX4_DEBUG */ #ifdef CONFIG_PCI_MSI static int msi_x = 1; module_param(msi_x, int, 0444); MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x (non-SRIOV only)"); #else /* CONFIG_PCI_MSI */ #define msi_x (0) #endif /* CONFIG_PCI_MSI */ static int enable_sys_tune = 0; module_param(enable_sys_tune, int, 0444); MODULE_PARM_DESC(enable_sys_tune, "Tune the cpu's for better performance (default 0)"); int mlx4_blck_lb = 1; module_param_named(block_loopback, mlx4_blck_lb, int, 0644); MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0 " "(default: 1)"); enum { DEFAULT_DOMAIN = 0, BDF_STR_SIZE = 8, /* bb:dd.f- */ DBDF_STR_SIZE = 13 /* mmmm:bb:dd.f- */ }; enum { NUM_VFS, PROBE_VF, PORT_TYPE_ARRAY }; enum { VALID_DATA, INVALID_DATA, INVALID_STR }; struct param_data { int id; struct mlx4_dbdf2val_lst dbdf2val; }; static struct param_data num_vfs = { .id = NUM_VFS, .dbdf2val = { .name = "num_vfs param", .num_vals = 1, .def_val = {0}, .range = {0, MLX4_MAX_NUM_VF} } }; module_param_string(num_vfs, num_vfs.dbdf2val.str, sizeof(num_vfs.dbdf2val.str), 0444); MODULE_PARM_DESC(num_vfs, "Either single value (e.g. '5') to define uniform num_vfs value for all devices functions\n" "\t\tor a string to map device function numbers to their num_vfs values (e.g. '0000:04:00.0-5,002b:1c:0b.a-15').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for num_vfs value (e.g. 15)."); static struct param_data probe_vf = { .id = PROBE_VF, .dbdf2val = { .name = "probe_vf param", .num_vals = 1, .def_val = {0}, .range = {0, MLX4_MAX_NUM_VF} } }; module_param_string(probe_vf, probe_vf.dbdf2val.str, sizeof(probe_vf.dbdf2val.str), 0444); MODULE_PARM_DESC(probe_vf, "Either single value (e.g. '3') to define uniform number of VFs to probe by the pf driver for all devices functions\n" "\t\tor a string to map device function numbers to their probe_vf values (e.g. '0000:04:00.0-3,002b:1c:0b.a-13').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for probe_vf value (e.g. 13)."); int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; module_param_named(log_num_mgm_entry_size, mlx4_log_num_mgm_entry_size, int, 0444); MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" " of qp per mcg, for example:" " 10 gives 248.range: 7 <=" " log_num_mgm_entry_size <= 12." 
" To activate device managed" " flow steering when available, set to -1"); static int high_rate_steer; module_param(high_rate_steer, int, 0444); MODULE_PARM_DESC(high_rate_steer, "Enable steering mode for higher packet rate" " (default off)"); static int fast_drop; module_param_named(fast_drop, fast_drop, int, 0444); MODULE_PARM_DESC(fast_drop, "Enable fast packet drop when no recieve WQEs are posted"); int mlx4_enable_64b_cqe_eqe = 1; module_param_named(enable_64b_cqe_eqe, mlx4_enable_64b_cqe_eqe, int, 0644); MODULE_PARM_DESC(enable_64b_cqe_eqe, "Enable 64 byte CQEs/EQEs when the the FW supports this if non-zero (default: 1)"); #define HCA_GLOBAL_CAP_MASK 0 #define PF_CONTEXT_BEHAVIOUR_MASK MLX4_FUNC_CAP_64B_EQE_CQE static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); static int log_num_vlan; module_param_named(log_num_vlan, log_num_vlan, int, 0444); MODULE_PARM_DESC(log_num_vlan, "(Obsolete) Log2 max number of VLANs per ETH port (0-7)"); /* Log2 max number of VLANs per ETH port (0-7) */ #define MLX4_LOG_NUM_VLANS 7 int log_mtts_per_seg = ilog2(1); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment " "(0-7) (default: 0)"); static struct param_data port_type_array = { .id = PORT_TYPE_ARRAY, .dbdf2val = { .name = "port_type_array param", .num_vals = 2, .def_val = {MLX4_PORT_TYPE_ETH, MLX4_PORT_TYPE_ETH}, .range = {MLX4_PORT_TYPE_IB, MLX4_PORT_TYPE_NA} } }; module_param_string(port_type_array, port_type_array.dbdf2val.str, sizeof(port_type_array.dbdf2val.str), 0444); MODULE_PARM_DESC(port_type_array, "Either pair of values (e.g. '1,2') to define uniform port1/port2 types configuration for all devices functions\n" "\t\tor a string to map device function numbers to their pair of port types values (e.g. '0000:04:00.0-1;2,002b:1c:0b.a-1;1').\n" "\t\tValid port types: 1-ib, 2-eth, 3-auto, 4-N/A\n" "\t\tIn case that only one port is available use the N/A port type for port2 (e.g '1,4')."); struct mlx4_port_config { struct list_head list; enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; struct pci_dev *pdev; }; #define MLX4_LOG_NUM_MTT 20 /* We limit to 30 as of a bit map issue which uses int and not uint. see mlx4_buddy_init -> bitmap_zero which gets int. 
*/ #define MLX4_MAX_LOG_NUM_MTT 30 static struct mlx4_profile mod_param_profile = { .num_qp = 19, .num_srq = 16, .rdmarc_per_qp = 4, .num_cq = 16, .num_mcg = 13, .num_mpt = 19, .num_mtt_segs = 0, /* max(20, 2*MTTs for host memory) */ }; module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA (default: 19)"); module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA " "(default: 16)"); module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444); MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP " "(default: 4)"); module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA (default: 16)"); module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA " "(default: 13)"); module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); MODULE_PARM_DESC(log_num_mpt, "log maximum number of memory protection table entries per " "HCA (default: 19)"); module_param_named(log_num_mtt, mod_param_profile.num_mtt_segs, int, 0444); MODULE_PARM_DESC(log_num_mtt, "log maximum number of memory translation table segments per " "HCA (default: max(20, 2*MTTs needed to register all of the host memory, limited to 30))"); enum { MLX4_IF_STATE_BASIC, MLX4_IF_STATE_EXTENDED }; static inline u64 dbdf_to_u64(int domain, int bus, int dev, int fn) { return (domain << 20) | (bus << 12) | (dev << 4) | fn; } static inline void pr_bdf_err(const char *dbdf, const char *pname) { pr_warn("mlx4_core: '%s' is not a valid bdf in '%s'\n", dbdf, pname); } static inline void pr_val_err(const char *dbdf, const char *pname, const char *val) { pr_warn("mlx4_core: value '%s' of bdf '%s' in '%s' is not valid\n" , val, dbdf, pname); } static inline void pr_out_of_range_bdf(const char *dbdf, int val, struct mlx4_dbdf2val_lst *dbdf2val) { pr_warn("mlx4_core: value %d in bdf '%s' of '%s' is out of its valid range (%d,%d)\n" , val, dbdf, dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max); } static inline void pr_out_of_range(struct mlx4_dbdf2val_lst *dbdf2val) { pr_warn("mlx4_core: value of '%s' is out of its valid range (%d,%d)\n" , dbdf2val->name , dbdf2val->range.min, dbdf2val->range.max); } static inline int is_in_range(int val, struct mlx4_range *r) { return (val >= r->min && val <= r->max); } static int update_defaults(struct param_data *pdata) { long int val[MLX4_MAX_BDF_VALS]; int ret; char *t, *p = pdata->dbdf2val.str; char sval[32]; int val_len; if (!strlen(p) || strchr(p, ':') || strchr(p, '.') || strchr(p, ';')) return INVALID_STR; switch (pdata->id) { case PORT_TYPE_ARRAY: t = strchr(p, ','); if (!t || t == p || (t - p) > sizeof(sval)) return INVALID_STR; val_len = t - p; strncpy(sval, p, val_len); sval[val_len] = 0; ret = kstrtol(sval, 0, &val[0]); if (ret == -EINVAL) return INVALID_STR; if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } ret = kstrtol(t + 1, 0, &val[1]); if (ret == -EINVAL) return INVALID_STR; if (ret || !is_in_range(val[1], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } pdata->dbdf2val.tbl[0].val[0] = val[0]; pdata->dbdf2val.tbl[0].val[1] = val[1]; break; case NUM_VFS: case PROBE_VF: ret = kstrtol(p, 0, &val[0]); if (ret == -EINVAL) return
INVALID_STR; if (ret || !is_in_range(val[0], &pdata->dbdf2val.range)) { pr_out_of_range(&pdata->dbdf2val); return INVALID_DATA; } pdata->dbdf2val.tbl[0].val[0] = val[0]; break; } pdata->dbdf2val.tbl[1].dbdf = MLX4_ENDOF_TBL; return VALID_DATA; } int mlx4_fill_dbdf2val_tbl(struct mlx4_dbdf2val_lst *dbdf2val_lst) { int domain, bus, dev, fn; u64 dbdf; char *p, *t, *v; char tmp[32]; char sbdf[32]; char sep = ','; int j, k, str_size, i = 1; int prfx_size; p = dbdf2val_lst->str; for (j = 0; j < dbdf2val_lst->num_vals; j++) dbdf2val_lst->tbl[0].val[j] = dbdf2val_lst->def_val[j]; dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; str_size = strlen(dbdf2val_lst->str); if (str_size == 0) return 0; while (strlen(p)) { prfx_size = BDF_STR_SIZE; sbdf[prfx_size] = 0; strncpy(sbdf, p, prfx_size); domain = DEFAULT_DOMAIN; if (sscanf(sbdf, "%02x:%02x.%x-", &bus, &dev, &fn) != 3) { prfx_size = DBDF_STR_SIZE; sbdf[prfx_size] = 0; strncpy(sbdf, p, prfx_size); if (sscanf(sbdf, "%04x:%02x:%02x.%x-", &domain, &bus, &dev, &fn) != 4) { pr_bdf_err(sbdf, dbdf2val_lst->name); goto err; } sprintf(tmp, "%04x:%02x:%02x.%x-", domain, bus, dev, fn); } else { sprintf(tmp, "%02x:%02x.%x-", bus, dev, fn); } if (strnicmp(sbdf, tmp, sizeof(tmp))) { pr_bdf_err(sbdf, dbdf2val_lst->name); goto err; } dbdf = dbdf_to_u64(domain, bus, dev, fn); for (j = 1; j < i; j++) if (dbdf2val_lst->tbl[j].dbdf == dbdf) { pr_warn("mlx4_core: in '%s', %s appears multiple times\n" , dbdf2val_lst->name, sbdf); goto err; } if (i >= MLX4_DEVS_TBL_SIZE) { pr_warn("mlx4_core: Too many devices in '%s'\n" , dbdf2val_lst->name); goto err; } p += prfx_size; t = strchr(p, sep); t = t ? t : p + strlen(p); if (p >= t) { pr_val_err(sbdf, dbdf2val_lst->name, ""); goto err; } for (k = 0; k < dbdf2val_lst->num_vals; k++) { char sval[32]; long int val; int ret, val_len; char vsep = ';'; v = (k == dbdf2val_lst->num_vals - 1) ? t : strchr(p, vsep); if (!v || v > t || v == p || (v - p) > sizeof(sval)) { pr_val_err(sbdf, dbdf2val_lst->name, p); goto err; } val_len = v - p; strncpy(sval, p, val_len); sval[val_len] = 0; ret = kstrtol(sval, 0, &val); if (ret) { if (strchr(p, vsep)) pr_warn("mlx4_core: too many vals in bdf '%s' of '%s'\n" , sbdf, dbdf2val_lst->name); else pr_val_err(sbdf, dbdf2val_lst->name, sval); goto err; } if (!is_in_range(val, &dbdf2val_lst->range)) { pr_out_of_range_bdf(sbdf, val, dbdf2val_lst); goto err; } dbdf2val_lst->tbl[i].val[k] = val; p = v; if (p[0] == vsep) p++; } dbdf2val_lst->tbl[i].dbdf = dbdf; if (strlen(p)) { if (p[0] != sep) { pr_warn("mlx4_core: expect separator '%c' before '%s' in '%s'\n" , sep, p, dbdf2val_lst->name); goto err; } p++; } i++; if (i < MLX4_DEVS_TBL_SIZE) dbdf2val_lst->tbl[i].dbdf = MLX4_ENDOF_TBL; } return 0; err: dbdf2val_lst->tbl[1].dbdf = MLX4_ENDOF_TBL; pr_warn("mlx4_core: The value of '%s' is incorrect. 
The value is discarded!\n" , dbdf2val_lst->name); return -EINVAL; } EXPORT_SYMBOL(mlx4_fill_dbdf2val_tbl); int mlx4_get_val(struct mlx4_dbdf2val *tbl, struct pci_dev *pdev, int idx, int *val) { u64 dbdf; int i = 1; *val = tbl[0].val[idx]; if (!pdev) return -EINVAL; dbdf = dbdf_to_u64(pci_get_domain(pdev->dev.bsddev), pci_get_bus(pdev->dev.bsddev), PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); while ((i < MLX4_DEVS_TBL_SIZE) && (tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (tbl[i].dbdf == dbdf) { *val = tbl[i].val[idx]; return 0; } i++; } return 0; } EXPORT_SYMBOL(mlx4_get_val); static void process_mod_param_profile(struct mlx4_profile *profile) { vm_size_t hwphyssz; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.realmem", (u_long *) &hwphyssz); profile->num_qp = 1 << mod_param_profile.num_qp; profile->num_srq = 1 << mod_param_profile.num_srq; profile->rdmarc_per_qp = 1 << mod_param_profile.rdmarc_per_qp; profile->num_cq = 1 << mod_param_profile.num_cq; profile->num_mcg = 1 << mod_param_profile.num_mcg; profile->num_mpt = 1 << mod_param_profile.num_mpt; /* * We want to scale the number of MTTs with the size of the * system memory, since it makes sense to register a lot of * memory on a system with a lot of memory. As a heuristic, * make sure we have enough MTTs to register twice the system * memory (with PAGE_SIZE entries). * * This number has to be a power of two and fit into 32 bits * due to device limitations. We cap this at 2^30 due to a bitmap * limitation (it works with int instead of uint; see mlx4_buddy_init -> bitmap_zero). * That limits us to 4TB of memory registration per HCA with * 4KB pages, which is probably OK for the next few months. */ if (mod_param_profile.num_mtt_segs) profile->num_mtt_segs = 1 << mod_param_profile.num_mtt_segs; else { profile->num_mtt_segs = roundup_pow_of_two(max_t(unsigned, 1 << (MLX4_LOG_NUM_MTT - log_mtts_per_seg), min(1UL << (MLX4_MAX_LOG_NUM_MTT - log_mtts_per_seg), (hwphyssz << 1) >> log_mtts_per_seg))); /* set the actual value, so it will be reflected to the user via sysfs */ mod_param_profile.num_mtt_segs = ilog2(profile->num_mtt_segs); } } int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { int i; for (i = 0; i < dev->caps.num_ports - 1; i++) { if (port_type[i] != port_type[i + 1]) { if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { mlx4_err(dev, "Only same port types supported " "on this HCA, aborting.\n"); return -EINVAL; } } } for (i = 0; i < dev->caps.num_ports; i++) { if (!(port_type[i] & dev->caps.supported_type[i+1])) { mlx4_err(dev, "Requested port type for port %d is not " "supported on this HCA\n", i + 1); return -EINVAL; } } return 0; } static void mlx4_set_port_mask(struct mlx4_dev *dev) { int i; for (i = 1; i <= dev->caps.num_ports; ++i) dev->caps.port_mask[i] = dev->caps.port_type[i]; } static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; int i; err = mlx4_QUERY_DEV_CAP(dev, dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); return err; } if (dev_cap->min_page_sz > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", dev_cap->min_page_sz, PAGE_SIZE); return -ENODEV; } if (dev_cap->num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev_cap->num_ports, MLX4_MAX_PORTS); return -ENODEV; } if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, aborting.\n",
dev_cap->uar_size, (unsigned long long) pci_resource_len(dev->pdev, 2)); return -ENODEV; } dev->caps.num_ports = dev_cap->num_ports; dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.vl_cap[i] = dev_cap->max_vl[i]; dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; dev->phys_caps.gid_phys_table_len[i] = dev_cap->max_gids[i]; dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i]; /* set gid and pkey table operating lengths by default * to non-sriov values */ dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; dev->caps.def_mac[i] = dev_cap->def_mac[i]; dev->caps.supported_type[i] = dev_cap->supported_port_types[i]; dev->caps.suggested_type[i] = dev_cap->suggested_type[i]; dev->caps.default_sense[i] = dev_cap->default_sense[i]; dev->caps.trans_type[i] = dev_cap->trans_type[i]; dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i]; dev->caps.wavelength[i] = dev_cap->wavelength[i]; dev->caps.trans_code[i] = dev_cap->trans_code[i]; } dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; dev->caps.bf_reg_size = dev_cap->bf_reg_size; dev->caps.bf_regs_per_page = dev_cap->bf_regs_per_page; dev->caps.max_sq_sg = dev_cap->max_sq_sg; dev->caps.max_rq_sg = dev_cap->max_rq_sg; dev->caps.max_wqes = dev_cap->max_qp_sz; dev->caps.max_qp_init_rdma = dev_cap->max_requester_per_qp; dev->caps.max_srq_wqes = dev_cap->max_srq_sz; dev->caps.max_srq_sge = dev_cap->max_rq_sg - 1; dev->caps.reserved_srqs = dev_cap->reserved_srqs; dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; /* * Subtract 1 from the limit because we need to allocate a * spare CQE to enable resizing the CQ */ dev->caps.max_cqes = dev_cap->max_cq_sz - 1; dev->caps.reserved_cqs = dev_cap->reserved_cqs; dev->caps.reserved_eqs = dev_cap->reserved_eqs; dev->caps.reserved_mtts = dev_cap->reserved_mtts; dev->caps.reserved_mrws = dev_cap->reserved_mrws; /* The first 128 UARs are used for EQ doorbells */ dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->reserved_xrcds : 0; dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? dev_cap->max_xrcds : 0; dev->caps.mtt_entry_sz = dev_cap->mtt_entry_sz; dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); dev->caps.flags = dev_cap->flags; dev->caps.flags2 = dev_cap->flags2; dev->caps.bmme_flags = dev_cap->bmme_flags; dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.cq_timestamp = dev_cap->timestamp_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; /* Don't do sense port on multifunction devices (for now at least) */ if (mlx4_is_mfunc(dev)) dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; dev->caps.log_num_macs = log_num_mac; dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; dev->caps.fast_drop = fast_drop ? 
!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FAST_DROP) : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; if (dev->caps.supported_type[i]) { /* if only ETH is supported - assign ETH */ if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; /* if only IB is supported, assign IB */ else if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_IB) dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; else { /* * if IB and ETH are supported, we set the port * type according to user selection of port type; * if there is no user selection, take the FW hint */ int pta; mlx4_get_val(port_type_array.dbdf2val.tbl, pci_physfn(dev->pdev), i - 1, &pta); if (pta == MLX4_PORT_TYPE_NONE) { dev->caps.port_type[i] = dev->caps.suggested_type[i] ? MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; } else if (pta == MLX4_PORT_TYPE_NA) { mlx4_err(dev, "Port %d is a valid port. " "It is not allowed to configure its type to N/A(%d)\n", i, MLX4_PORT_TYPE_NA); return -EINVAL; } else { dev->caps.port_type[i] = pta; } } } /* * Link sensing is allowed on the port if 3 conditions are true: * 1. Both protocols are supported on the port. * 2. Different types are supported on the port. * 3. FW declared that it supports link sensing */ mlx4_priv(dev)->sense.sense_allowed[i] = ((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) && (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); /* Disabling auto sense to support default Eth ports */ mlx4_priv(dev)->sense.sense_allowed[i] = 0; /* * If "default_sense" bit is set, we move the port to "AUTO" mode * and perform sense_port FW command to try and set the correct * port type from the beginning */ if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) { enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE; dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO; mlx4_SENSE_PORT(dev, i, &sensed_port); if (sensed_port != MLX4_PORT_TYPE_NONE) dev->caps.port_type[i] = sensed_port; } else { dev->caps.possible_type[i] = dev->caps.port_type[i]; } if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) { dev->caps.log_num_macs = dev_cap->log_max_macs[i]; mlx4_warn(dev, "Requested number of MACs is too high " "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_macs); } if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; mlx4_warn(dev, "Requested number of VLANs is too high " "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_vlans); } } dev->caps.max_basic_counters = dev_cap->max_basic_counters; dev->caps.max_extended_counters = dev_cap->max_extended_counters; /* support extended counters if available */ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS_EXT) dev->caps.max_counters = dev->caps.max_extended_counters; else dev->caps.max_counters = dev->caps.max_basic_counters; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = (1 << dev->caps.log_num_macs) * (1 << dev->caps.log_num_vlans) * dev->caps.num_ports; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; dev->caps.sync_qp = dev_cap->sync_qp; if (dev->pdev->device == 0x1003)
dev->caps.cq_flags |= MLX4_DEV_CAP_CQ_FLAG_IO; dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0; if (!mlx4_enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { if (dev_cap->flags & (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; } } if ((dev->caps.flags & (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && mlx4_is_master(dev)) dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; if (!mlx4_is_slave(dev)) { for (i = 0; i < dev->caps.num_ports; ++i) dev->caps.def_counter_index[i] = i << 1; } return 0; } /* The function checks if there are live VFs and returns their number */ static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_state; int i; int ret = 0; for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { s_state = &priv->mfunc.master.slave_state[i]; if (s_state->active && s_state->last_cmd != MLX4_COMM_CMD_RESET) { mlx4_warn(dev, "%s: slave: %d is still active\n", __func__, i); ret++; } } return ret; } int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) { u32 qk = MLX4_RESERVED_QKEY_BASE; if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX || qpn < dev->phys_caps.base_proxy_sqpn) return -EINVAL; if (qpn >= dev->phys_caps.base_tunnel_sqpn) /* tunnel qp */ qk += qpn - dev->phys_caps.base_tunnel_sqpn; else qk += qpn - dev->phys_caps.base_proxy_sqpn; *qkey = qk; return 0; } EXPORT_SYMBOL(mlx4_get_parav_qkey); void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return; priv->virt2phys_pkey[slave][port - 1][i] = val; } EXPORT_SYMBOL(mlx4_sync_pkey_table); void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return; priv->slave_node_guids[slave] = guid; } EXPORT_SYMBOL(mlx4_put_slave_node_guid); __be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); if (!mlx4_is_master(dev)) return 0; return priv->slave_node_guids[slave]; } EXPORT_SYMBOL(mlx4_get_slave_node_guid); int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_slave_state *s_slave; if (!mlx4_is_master(dev)) return 0; s_slave = &priv->mfunc.master.slave_state[slave]; return !!s_slave->active; } EXPORT_SYMBOL(mlx4_is_slave_active); static void slave_adjust_steering_mode(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *hca_param) { dev->caps.steering_mode = hca_param->steering_mode; if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; else dev->caps.num_qp_per_mgm = 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); mlx4_dbg(dev, "Steering mode is: %s\n", mlx4_steering_mode_str(dev->caps.steering_mode)); } static int mlx4_slave_cap(struct mlx4_dev *dev) { int err; u32 page_size; struct mlx4_dev_cap dev_cap; struct mlx4_func_cap func_cap; struct mlx4_init_hca_param hca_param; int i; memset(&hca_param, 0, sizeof(hca_param)); err = mlx4_QUERY_HCA(dev, &hca_param); if (err) { mlx4_err(dev, "QUERY_HCA command failed, aborting.\n"); return err; } /* fail if the hca has an unknown
capability */ if ((hca_param.global_caps | HCA_GLOBAL_CAP_MASK) != HCA_GLOBAL_CAP_MASK) { mlx4_err(dev, "Unknown hca global capabilities\n"); return -ENOSYS; } mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; dev->caps.hca_core_clock = hca_param.hca_core_clock; memset(&dev_cap, 0, sizeof(dev_cap)); dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; err = mlx4_dev_cap(dev, &dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); return err; } err = mlx4_QUERY_FW(dev); if (err) mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n"); if (!hca_param.mw_enable) { dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; dev->caps.bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; } page_size = ~dev->caps.page_size_cap + 1; mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); if (page_size > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than " "kernel PAGE_SIZE of %d, aborting.\n", page_size, PAGE_SIZE); return -ENODEV; } /* slave gets uar page size from QUERY_HCA fw command */ dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12); /* TODO: relax this assumption */ if (dev->caps.uar_page_size != PAGE_SIZE) { mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %d\n", dev->caps.uar_page_size, PAGE_SIZE); return -ENODEV; } memset(&func_cap, 0, sizeof(func_cap)); err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); if (err) { mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d).\n", err); return err; } if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != PF_CONTEXT_BEHAVIOUR_MASK) { mlx4_err(dev, "Unknown pf context behaviour\n"); return -ENOSYS; } dev->caps.num_ports = func_cap.num_ports; dev->quotas.qp = func_cap.qp_quota; dev->quotas.srq = func_cap.srq_quota; dev->quotas.cq = func_cap.cq_quota; dev->quotas.mpt = func_cap.mpt_quota; dev->quotas.mtt = func_cap.mtt_quota; dev->caps.num_qps = 1 << hca_param.log_num_qps; dev->caps.num_srqs = 1 << hca_param.log_num_srqs; dev->caps.num_cqs = 1 << hca_param.log_num_cqs; dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; dev->caps.num_eqs = func_cap.max_eq; dev->caps.reserved_eqs = func_cap.reserved_eq; dev->caps.num_pds = MLX4_NUM_PDS; dev->caps.num_mgms = 0; dev->caps.num_amgms = 0; if (dev->caps.num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); return -ENODEV; } dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL); if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { err = -ENOMEM; goto err_mem; } for (i = 1; i <= dev->caps.num_ports; ++i) { err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap); if (err) { mlx4_err(dev, "QUERY_FUNC_CAP port command failed for" " port %d, aborting (%d).\n", i, err); goto err_mem; } dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; dev->caps.def_counter_index[i - 1] = func_cap.def_counter_index; dev->caps.port_mask[i] = dev->caps.port_type[i]; err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, &dev->caps.gid_table_len[i], &dev->caps.pkey_table_len[i]); if (err) goto err_mem; } if 
(dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > pci_resource_len(dev->pdev, 2)) { mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " "PCI resource 2 size of 0x%llx, aborting.\n", dev->caps.uar_page_size * dev->caps.num_uars, (unsigned long long) pci_resource_len(dev->pdev, 2)); err = -ENOMEM; goto err_mem; } if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { dev->caps.eqe_size = 64; dev->caps.eqe_factor = 1; } else { dev->caps.eqe_size = 32; dev->caps.eqe_factor = 0; } if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_64B_CQE; } else { dev->caps.cqe_size = 32; } dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode.\n"); slave_adjust_steering_mode(dev, &dev_cap, &hca_param); return 0; err_mem: kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); dev->caps.qp0_tunnel = dev->caps.qp0_proxy = dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; return err; } static void mlx4_request_modules(struct mlx4_dev *dev) { int port; int has_ib_port = false; int has_eth_port = false; #define EN_DRV_NAME "mlx4_en" #define IB_DRV_NAME "mlx4_ib" for (port = 1; port <= dev->caps.num_ports; port++) { if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) has_ib_port = true; else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) has_eth_port = true; } if (has_ib_port) request_module_nowait(IB_DRV_NAME); if (has_eth_port) request_module_nowait(EN_DRV_NAME); } /* * Change the port configuration of the device. * Every user of this function must hold the port mutex. */ int mlx4_change_port_types(struct mlx4_dev *dev, enum mlx4_port_type *port_types) { int err = 0; int change = 0; int port; for (port = 0; port < dev->caps.num_ports; port++) { /* Change the port type only if the new type is different * from the current, and not set to Auto */ if (port_types[port] != dev->caps.port_type[port + 1]) change = 1; } if (change) { mlx4_unregister_device(dev); for (port = 1; port <= dev->caps.num_ports; port++) { mlx4_CLOSE_PORT(dev, port); dev->caps.port_type[port] = port_types[port - 1]; err = mlx4_SET_PORT(dev, port, -1); if (err) { mlx4_err(dev, "Failed to set port %d, " "aborting\n", port); goto out; } } mlx4_set_port_mask(dev); err = mlx4_register_device(dev); if (err) { mlx4_err(dev, "Failed to register device\n"); goto out; } mlx4_request_modules(dev); } out: return err; } static ssize_t show_port_type(struct device *dev, struct device_attribute *attr, char *buf) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_attr); struct mlx4_dev *mdev = info->dev; char type[8]; sprintf(type, "%s", (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ? 
"ib" : "eth"); if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO) sprintf(buf, "auto (%s)\n", type); else sprintf(buf, "%s\n", type); return strlen(buf); } static ssize_t set_port_type(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_attr); struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); enum mlx4_port_type types[MLX4_MAX_PORTS]; enum mlx4_port_type new_types[MLX4_MAX_PORTS]; int i; int err = 0; if (!strcmp(buf, "ib\n")) info->tmp_type = MLX4_PORT_TYPE_IB; else if (!strcmp(buf, "eth\n")) info->tmp_type = MLX4_PORT_TYPE_ETH; else if (!strcmp(buf, "auto\n")) info->tmp_type = MLX4_PORT_TYPE_AUTO; else { mlx4_err(mdev, "%s is not supported port type\n", buf); return -EINVAL; } if ((info->tmp_type & mdev->caps.supported_type[info->port]) != info->tmp_type) { mlx4_err(mdev, "Requested port type for port %d is not supported on this HCA\n", info->port); return -EINVAL; } mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); /* Possible type is always the one that was delivered */ mdev->caps.possible_type[info->port] = info->tmp_type; for (i = 0; i < mdev->caps.num_ports; i++) { types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type : mdev->caps.possible_type[i+1]; if (types[i] == MLX4_PORT_TYPE_AUTO) types[i] = mdev->caps.port_type[i+1]; } if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) { for (i = 1; i <= mdev->caps.num_ports; i++) { if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { mdev->caps.possible_type[i] = mdev->caps.port_type[i]; err = -EINVAL; } } } if (err) { mlx4_err(mdev, "Auto sensing is not supported on this HCA. " "Set only 'eth' or 'ib' for both ports " "(should be the same)\n"); goto out; } mlx4_do_sense_ports(mdev, new_types, types); err = mlx4_check_port_params(mdev, new_types); if (err) goto out; /* We are about to apply the changes after the configuration * was verified, no need to remember the temporary types * any more */ for (i = 0; i < mdev->caps.num_ports; i++) priv->port[i + 1].tmp_type = 0; err = mlx4_change_port_types(mdev, new_types); out: mlx4_start_sense(mdev); mutex_unlock(&priv->port_mutex); return err ? 
err : count; } enum ibta_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; static inline int int_to_ibta_mtu(int mtu) { switch (mtu) { case 256: return IB_MTU_256; case 512: return IB_MTU_512; case 1024: return IB_MTU_1024; case 2048: return IB_MTU_2048; case 4096: return IB_MTU_4096; default: return -1; } } static inline int ibta_mtu_to_int(enum ibta_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static ssize_t show_port_ib_mtu(struct device *dev, struct device_attribute *attr, char *buf) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_mtu_attr); struct mlx4_dev *mdev = info->dev; if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); sprintf(buf, "%d\n", ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); return strlen(buf); } static ssize_t set_port_ib_mtu(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, port_mtu_attr); struct mlx4_dev *mdev = info->dev; struct mlx4_priv *priv = mlx4_priv(mdev); int err, port, mtu, ibta_mtu = -1; if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) { mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); return -EINVAL; } mtu = (int) simple_strtol(buf, NULL, 0); ibta_mtu = int_to_ibta_mtu(mtu); if (ibta_mtu < 0) { mlx4_err(mdev, "%s is an invalid IBTA mtu\n", buf); return -EINVAL; } mdev->caps.port_ib_mtu[info->port] = ibta_mtu; mlx4_stop_sense(mdev); mutex_lock(&priv->port_mutex); mlx4_unregister_device(mdev); for (port = 1; port <= mdev->caps.num_ports; port++) { mlx4_CLOSE_PORT(mdev, port); err = mlx4_SET_PORT(mdev, port, -1); if (err) { mlx4_err(mdev, "Failed to set port %d, " "aborting\n", port); goto err_set_port; } } err = mlx4_register_device(mdev); err_set_port: mutex_unlock(&priv->port_mutex); mlx4_start_sense(mdev); return err ?
err : count; } static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err, unmap_flag = 0; priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.fw_icm) { mlx4_err(dev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; } err = mlx4_MAP_FA(dev, priv->fw.fw_icm); if (err) { mlx4_err(dev, "MAP_FA command failed, aborting.\n"); goto err_free; } err = mlx4_RUN_FW(dev); if (err) { mlx4_err(dev, "RUN_FW command failed, aborting.\n"); goto err_unmap_fa; } return 0; err_unmap_fa: unmap_flag = mlx4_UNMAP_FA(dev); if (unmap_flag) pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); err_free: if (!unmap_flag) mlx4_free_icm(dev, priv->fw.fw_icm, 0); return err; } static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, int cmpt_entry_sz) { struct mlx4_priv *priv = mlx4_priv(dev); int err; int num_eqs; err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_QP * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) goto err; err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_SRQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_srqs, dev->caps.reserved_srqs, 0, 0); if (err) goto err_qp; err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_CQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, dev->caps.num_cqs, dev->caps.reserved_cqs, 0, 0); if (err) goto err_srq; num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), cmpt_entry_sz, num_eqs, num_eqs, 0, 0); if (err) goto err_cq; return 0; err_cq: mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); err_srq: mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); err_qp: mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); err: return err; } static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca, u64 icm_size) { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; int num_eqs; int err, unmap_flag = 0; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); if (err) { mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n"); return err; } mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n", (unsigned long long) icm_size >> 10, (unsigned long long) aux_pages << 2); priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, GFP_HIGHUSER | __GFP_NOWARN, 0); if (!priv->fw.aux_icm) { mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; } err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm); if (err) { mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n"); goto err_free_aux; } err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz); if (err) { mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n"); goto err_unmap_aux; } num_eqs = (mlx4_is_master(dev)) ? 
dev->phys_caps.num_phys_eqs : dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, num_eqs, num_eqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; } /* * Reserved MTT entries must be aligned up to a cacheline * boundary, since the FW will write to them, while the driver * writes to all other MTT entries. (The variable * dev->caps.mtt_entry_sz below is really the MTT segment * size, not the raw entry size) */ dev->caps.reserved_mtts = ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz, dma_get_cache_alignment()) / dev->caps.mtt_entry_sz; err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, init_hca->mtt_base, dev->caps.mtt_entry_sz, dev->caps.num_mtts, dev->caps.reserved_mtts, 1, 0); if (err) { mlx4_err(dev, "Failed to map MTT context memory, aborting.\n"); goto err_unmap_eq; } err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table, init_hca->dmpt_base, dev_cap->dmpt_entry_sz, dev->caps.num_mpts, dev->caps.reserved_mrws, 1, 1); if (err) { mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n"); goto err_unmap_mtt; } err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table, init_hca->qpc_base, dev_cap->qpc_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map QP context memory, aborting.\n"); goto err_unmap_dmpt; } err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table, init_hca->auxc_base, dev_cap->aux_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n"); goto err_unmap_qp; } err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table, init_hca->altc_base, dev_cap->altc_entry_sz, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n"); goto err_unmap_auxc; } err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table, init_hca->rdmarc_base, dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift, dev->caps.num_qps, dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 0, 0); if (err) { mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n"); goto err_unmap_altc; } err = mlx4_init_icm_table(dev, &priv->cq_table.table, init_hca->cqc_base, dev_cap->cqc_entry_sz, dev->caps.num_cqs, dev->caps.reserved_cqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map CQ context memory, aborting.\n"); goto err_unmap_rdmarc; } err = mlx4_init_icm_table(dev, &priv->srq_table.table, init_hca->srqc_base, dev_cap->srq_entry_sz, dev->caps.num_srqs, dev->caps.reserved_srqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n"); goto err_unmap_cq; } /* * For flow steering device managed mode it is required to use * mlx4_init_icm_table. For B0 steering mode it's not strictly * required, but for simplicity just map the whole multicast * group table now. The table isn't very big and it's a lot * easier than trying to track ref counts. 
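As a rough worked example of the sizes involved (an illustrative aside, reusing the num_qp_per_mgm formula 4 * ((1 << log_entry_size) / 16 - 2) that appears in slave_adjust_steering_mode() and choose_log_fs_mgm_entry_size() in this file): a log entry size of 10 means 1024-byte MGM entries holding 4 * (1024/16 - 2) = 248 QPs per group, which is the "10 gives 248" example in the log_num_mgm_entry_size parameter description above.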
*/ err = mlx4_init_icm_table(dev, &priv->mcg_table.table, init_hca->mc_base, mlx4_get_mgm_entry_size(dev), dev->caps.num_mgms + dev->caps.num_amgms, dev->caps.num_mgms + dev->caps.num_amgms, 0, 0); if (err) { mlx4_err(dev, "Failed to map MCG context memory, aborting.\n"); goto err_unmap_srq; } return 0; err_unmap_srq: mlx4_cleanup_icm_table(dev, &priv->srq_table.table); err_unmap_cq: mlx4_cleanup_icm_table(dev, &priv->cq_table.table); err_unmap_rdmarc: mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); err_unmap_altc: mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); err_unmap_auxc: mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); err_unmap_qp: mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); err_unmap_dmpt: mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); err_unmap_mtt: mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); err_unmap_eq: mlx4_cleanup_icm_table(dev, &priv->eq_table.table); err_unmap_cmpt: mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); err_unmap_aux: unmap_flag = mlx4_UNMAP_ICM_AUX(dev); if (unmap_flag) pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); err_free_aux: if (!unmap_flag) mlx4_free_icm(dev, priv->fw.aux_icm, 0); return err; } static void mlx4_free_icms(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); mlx4_cleanup_icm_table(dev, &priv->mcg_table.table); mlx4_cleanup_icm_table(dev, &priv->srq_table.table); mlx4_cleanup_icm_table(dev, &priv->cq_table.table); mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); mlx4_cleanup_icm_table(dev, &priv->eq_table.table); mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); if (!mlx4_UNMAP_ICM_AUX(dev)) mlx4_free_icm(dev, priv->fw.aux_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_ICM_AUX failed.\n"); } static void mlx4_slave_exit(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); mutex_lock(&priv->cmd.slave_cmd_mutex); if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) mlx4_warn(dev, "Failed to close slave function.\n"); mutex_unlock(&priv->cmd.slave_cmd_mutex); } static int map_bf_area(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); resource_size_t bf_start; resource_size_t bf_len; int err = 0; if (!dev->caps.bf_reg_size) return -ENXIO; bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) err = -ENOMEM; return err; } static void unmap_bf_area(struct mlx4_dev *dev) { if (mlx4_priv(dev)->bf_mapping) io_mapping_free(mlx4_priv(dev)->bf_mapping); } int mlx4_read_clock(struct mlx4_dev *dev) { u32 clockhi, clocklo, clockhi1; cycle_t cycles; int i; struct mlx4_priv *priv = mlx4_priv(dev); if (!priv->clock_mapping) return -ENOTSUPP; for (i = 0; i < 10; i++) { clockhi = swab32(readl(priv->clock_mapping)); clocklo = 
swab32(readl(priv->clock_mapping + 4)); clockhi1 = swab32(readl(priv->clock_mapping)); if (clockhi == clockhi1) break; } cycles = (u64) clockhi << 32 | (u64) clocklo; return cycles; } EXPORT_SYMBOL_GPL(mlx4_read_clock); static int map_internal_clock(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); priv->clock_mapping = ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) + priv->fw.clock_offset, MLX4_CLOCK_SIZE); if (!priv->clock_mapping) return -ENOMEM; return 0; } int mlx4_get_internal_clock_params(struct mlx4_dev *dev, struct mlx4_clock_params *params) { struct mlx4_priv *priv = mlx4_priv(dev); if (mlx4_is_slave(dev)) return -ENOTSUPP; if (!params) return -EINVAL; params->bar = priv->fw.clock_bar; params->offset = priv->fw.clock_offset; params->size = MLX4_CLOCK_SIZE; return 0; } EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params); static void unmap_internal_clock(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); if (priv->clock_mapping) iounmap(priv->clock_mapping); } static void mlx4_close_hca(struct mlx4_dev *dev) { unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { mlx4_slave_exit(dev); } else { mlx4_CLOSE_HCA(dev, 0); mlx4_free_icms(dev); if (!mlx4_UNMAP_FA(dev)) mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); } } static int mlx4_init_slave(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); u64 dma = (u64) priv->mfunc.vhcr_dma; int num_of_reset_retries = NUM_OF_RESET_RETRIES; int ret_from_reset = 0; u32 slave_read; u32 cmd_channel_ver; mutex_lock(&priv->cmd.slave_cmd_mutex); priv->cmd.max_cmds = 1; mlx4_warn(dev, "Sending reset\n"); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); /* if we are in the middle of FLR, the slave will try * NUM_OF_RESET_RETRIES times before leaving. */ if (ret_from_reset) { if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { msleep(SLEEP_TIME_IN_RESET); while (ret_from_reset && num_of_reset_retries) { mlx4_warn(dev, "slave is currently in the " "middle of FLR, retrying..."
"(try num:%d)\n", (NUM_OF_RESET_RETRIES - num_of_reset_retries + 1)); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); num_of_reset_retries = num_of_reset_retries - 1; } } else goto err; } /* check the driver version - the slave I/F revision * must match the master's */ slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); cmd_channel_ver = mlx4_comm_get_version(); if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != MLX4_COMM_GET_IF_REV(slave_read)) { mlx4_err(dev, "slave driver version is not supported" " by the master\n"); goto err; } mlx4_warn(dev, "Sending vhcr0\n"); if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, MLX4_COMM_TIME)) goto err; if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) goto err; mutex_unlock(&priv->cmd.slave_cmd_mutex); return 0; err: mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); mutex_unlock(&priv->cmd.slave_cmd_mutex); return -EIO; } static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) { int i; for (i = 1; i <= dev->caps.num_ports; i++) { if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) dev->caps.gid_table_len[i] = mlx4_get_slave_num_gids(dev, 0); else dev->caps.gid_table_len[i] = 1; dev->caps.pkey_table_len[i] = dev->phys_caps.pkey_phys_table_len[i] - 1; } } static int choose_log_fs_mgm_entry_size(int qp_per_entry) { int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE; i++) { if (qp_per_entry <= 4 * ((1 << i) / 16 - 2)) break; } return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1; } static void choose_steering_mode(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int nvfs; mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(dev->pdev), 0, &nvfs); if (high_rate_steer && !mlx4_is_mfunc(dev)) { dev->caps.flags &= ~(MLX4_DEV_CAP_FLAG_VEP_MC_STEER | MLX4_DEV_CAP_FLAG_VEP_UC_STEER); dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_FS_EN; } if (mlx4_log_num_mgm_entry_size == -1 && dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && (!mlx4_is_mfunc(dev) || (dev_cap->fs_max_num_qp_per_entry >= (nvfs + 1))) && choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= MLX4_MIN_MGM_LOG_ENTRY_SIZE) { dev->oper_log_mgm_entry_size = choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; } else { if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) dev->caps.steering_mode = MLX4_STEERING_MODE_B0; else { dev->caps.steering_mode = MLX4_STEERING_MODE_A0; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags " "set to use B0 steering. Falling back to A0 steering mode.\n"); } dev->oper_log_mgm_entry_size = mlx4_log_num_mgm_entry_size > 0 ? 
mlx4_log_num_mgm_entry_size : MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); } mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, " "log_num_mgm_entry_size = %d\n", mlx4_steering_mode_str(dev->caps.steering_mode), dev->oper_log_mgm_entry_size, mlx4_log_num_mgm_entry_size); } static int mlx4_init_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_dev_cap *dev_cap = NULL; struct mlx4_adapter adapter; struct mlx4_mod_stat_cfg mlx4_cfg; struct mlx4_profile profile; struct mlx4_init_hca_param init_hca; u64 icm_size; int err; if (!mlx4_is_slave(dev)) { err = mlx4_QUERY_FW(dev); if (err) { if (err == -EACCES) mlx4_info(dev, "non-primary physical function, skipping.\n"); else mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); return err; } err = mlx4_load_fw(dev); if (err) { mlx4_err(dev, "Failed to start FW, aborting.\n"); return err; } mlx4_cfg.log_pg_sz_m = 1; mlx4_cfg.log_pg_sz = 0; err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); if (err) mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); dev_cap = kzalloc(sizeof *dev_cap, GFP_KERNEL); if (!dev_cap) { mlx4_err(dev, "Failed to allocate memory for dev_cap\n"); err = -ENOMEM; goto err_stop_fw; } err = mlx4_dev_cap(dev, dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); goto err_stop_fw; } choose_steering_mode(dev, dev_cap); if (mlx4_is_master(dev)) mlx4_parav_master_pf_caps(dev); process_mod_param_profile(&profile); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) profile.num_mcg = MLX4_FS_NUM_MCG; icm_size = mlx4_make_profile(dev, &profile, dev_cap, &init_hca); if ((long long) icm_size < 0) { err = icm_size; goto err_stop_fw; } dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; init_hca.log_uar_sz = ilog2(dev->caps.num_uars); init_hca.uar_page_sz = PAGE_SHIFT - 12; err = mlx4_init_icm(dev, dev_cap, &init_hca, icm_size); if (err) goto err_stop_fw; init_hca.mw_enable = 1; err = mlx4_INIT_HCA(dev, &init_hca); if (err) { mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); goto err_free_icm; } /* * Read HCA frequency by QUERY_HCA command */ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { memset(&init_hca, 0, sizeof(init_hca)); err = mlx4_QUERY_HCA(dev, &init_hca); if (err) { mlx4_err(dev, "QUERY_HCA command failed, disabling timestamping.\n"); dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; } else { dev->caps.hca_core_clock = init_hca.hca_core_clock; } /* In case we got HCA frequency 0 - disable timestamping * to avoid dividing by zero */ if (!dev->caps.hca_core_clock) { dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_err(dev, "HCA frequency is 0. Timestamping is not supported.\n"); } else if (map_internal_clock(dev)) { /* Map internal clock, * in case of failure disable timestamping */ dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_err(dev, "Failed to map internal clock.
Timestamping is not supported.\n"); } } } else { err = mlx4_init_slave(dev); if (err) { mlx4_err(dev, "Failed to initialize slave\n"); return err; } err = mlx4_slave_cap(dev); if (err) { mlx4_err(dev, "Failed to obtain slave caps\n"); goto err_close; } } if (map_bf_area(dev)) mlx4_dbg(dev, "Failed to map blue flame area\n"); /* Only the master sets the ports; all the rest get it from it. */ if (!mlx4_is_slave(dev)) mlx4_set_port_mask(dev); err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); goto unmap_bf; } priv->eq_table.inta_pin = adapter.inta_pin; memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id); memcpy(dev->vsd, adapter.vsd, sizeof(dev->vsd)); dev->vsd_vendor_id = adapter.vsd_vendor_id; if (!mlx4_is_slave(dev)) kfree(dev_cap); return 0; unmap_bf: if (!mlx4_is_slave(dev)) unmap_internal_clock(dev); unmap_bf_area(dev); if (mlx4_is_slave(dev)) { kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); } err_close: if (mlx4_is_slave(dev)) mlx4_slave_exit(dev); else mlx4_CLOSE_HCA(dev, 0); err_free_icm: if (!mlx4_is_slave(dev)) mlx4_free_icms(dev); err_stop_fw: if (!mlx4_is_slave(dev)) { if (!mlx4_UNMAP_FA(dev)) mlx4_free_icm(dev, priv->fw.fw_icm, 0); else pr_warn("mlx4_core: mlx4_UNMAP_FA failed.\n"); kfree(dev_cap); } return err; } static int mlx4_init_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int nent_pow2, port_indx, vf_index, num_counters; int res, index = 0; struct counter_index *new_counter_index; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; if (!mlx4_is_slave(dev) && dev->caps.max_counters == dev->caps.max_extended_counters) { res = mlx4_cmd(dev, MLX4_IF_STATE_EXTENDED, 0, 0, MLX4_CMD_SET_IF_STAT, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (res) { mlx4_err(dev, "Failed to set extended counters (err=%d)\n", res); return res; } } mutex_init(&priv->counters_table.mutex); if (mlx4_is_slave(dev)) { for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); if (dev->caps.def_counter_index[port_indx] != 0xFF) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) return -ENOMEM; new_counter_index->index = dev->caps.def_counter_index[port_indx]; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]); } } mlx4_dbg(dev, "%s: slave allocated %d counters for %d ports\n", __func__, dev->caps.num_ports, dev->caps.num_ports); return 0; } nent_pow2 = roundup_pow_of_two(dev->caps.max_counters); for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.global_port_list[port_indx]); /* allocating 2 counters per port for PFs */ /* For the PF, the ETH default counters are 0,2; */ /* and the RoCE default counters are 1,3 */ for (num_counters = 0; num_counters < 2; num_counters++, index++) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) return -ENOMEM; new_counter_index->index = index; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port_indx]); } } if (mlx4_is_master(dev)) { for (vf_index = 0; vf_index < dev->num_vfs; vf_index++) { for (port_indx = 0; port_indx < dev->caps.num_ports; port_indx++) { INIT_LIST_HEAD(&priv->counters_table.vf_list[vf_index][port_indx]); new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if
(!new_counter_index) return -ENOMEM; if (index < nent_pow2 - 2) { new_counter_index->index = index; index++; } else { new_counter_index->index = MLX4_SINK_COUNTER_INDEX; } list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[vf_index][port_indx]); } } res = mlx4_bitmap_init(&priv->counters_table.bitmap, nent_pow2, nent_pow2 - 1, index, 1); mlx4_dbg(dev, "%s: master allocated %d counters for %d VFs\n", __func__, index, dev->num_vfs); } else { res = mlx4_bitmap_init(&priv->counters_table.bitmap, nent_pow2, nent_pow2 - 1, index, 1); mlx4_dbg(dev, "%s: native allocated %d counters for %d ports\n", __func__, index, dev->caps.num_ports); } return 0; } static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i, j; struct counter_index *port, *tmp_port; struct counter_index *vf, *tmp_vf; mutex_lock(&priv->counters_table.mutex); if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) { for (i = 0; i < dev->caps.num_ports; i++) { list_for_each_entry_safe(port, tmp_port, &priv->counters_table.global_port_list[i], list) { list_del(&port->list); kfree(port); } } if (!mlx4_is_slave(dev)) { for (i = 0; i < dev->num_vfs; i++) { for (j = 0; j < dev->caps.num_ports; j++) { list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[i][j], list) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); list_del(&vf->list); kfree(vf); } } } mlx4_bitmap_cleanup(&priv->counters_table.bitmap); } } mutex_unlock(&priv->counters_table.mutex); } int __mlx4_slave_counters_free(struct mlx4_dev *dev, int slave) { struct mlx4_priv *priv = mlx4_priv(dev); int i, first; struct counter_index *vf, *tmp_vf; /* clean the VF's counters for the next usage */ if (slave > 0 && slave <= dev->num_vfs) { mlx4_dbg(dev, "%s: free counters of slave(%d)\n" , __func__, slave); mutex_lock(&priv->counters_table.mutex); for (i = 0; i < dev->caps.num_ports; i++) { first = 0; list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[slave - 1][i], list) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); if (first++ && vf->index != MLX4_SINK_COUNTER_INDEX) { mlx4_dbg(dev, "%s: delete counter index %d for slave %d and port %d\n" , __func__, vf->index, slave, i + 1); mlx4_bitmap_free(&priv->counters_table.bitmap, vf->index, MLX4_USE_RR); list_del(&vf->list); kfree(vf); } else { mlx4_dbg(dev, "%s: can't delete default counter index %d for slave %d and port %d\n" , __func__, vf->index, slave, i + 1); } } } mutex_unlock(&priv->counters_table.mutex); } return 0; } int __mlx4_counter_alloc(struct mlx4_dev *dev, int slave, int port, u32 *idx) { struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index; if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) return -ENOENT; if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || (port < 0) || (port > MLX4_MAX_PORTS)) { mlx4_dbg(dev, "%s: invalid slave(%d) or port(%d) index\n", __func__, slave, port); return -EINVAL; } /* handle old guests whose requests do not support allocation by port index */ if (port == 0) { *idx = MLX4_SINK_COUNTER_INDEX; mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); return 0; } mutex_lock(&priv->counters_table.mutex); *idx = mlx4_bitmap_alloc(&priv->counters_table.bitmap); /* if no resources return the default counter of the slave and port */ if (*idx == -1) {
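/* Illustrative note on the fallback below: mlx4_bitmap_alloc() returned -1, i.e. no free counter is left, so reuse the first (default) counter queued on this port: the head of the global per-port list for the PF/native case (slave == 0), or the head of the VF's per-port list otherwise. */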
if (slave == 0) { /* it's the ethernet counter? */ new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); } else { new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, struct counter_index, list); } *idx = new_counter_index->index; mlx4_dbg(dev, "%s: allocated default counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); goto out; } if (slave == 0) { /* native or master */ new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) goto no_mem; new_counter_index->index = *idx; list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); } else { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) goto no_mem; new_counter_index->index = *idx; list_add_tail(&new_counter_index->list, &priv->counters_table.vf_list[slave - 1][port - 1]); } mlx4_dbg(dev, "%s: allocated counter index %d for slave %d port %d\n" , __func__, *idx, slave, port); out: mutex_unlock(&priv->counters_table.mutex); return 0; no_mem: mlx4_bitmap_free(&priv->counters_table.bitmap, *idx, MLX4_USE_RR); mutex_unlock(&priv->counters_table.mutex); *idx = MLX4_SINK_COUNTER_INDEX; mlx4_dbg(dev, "%s: failed err (%d)\n" , __func__, -ENOMEM); return -ENOMEM; } int mlx4_counter_alloc(struct mlx4_dev *dev, u8 port, u32 *idx) { u64 out_param; int err; struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index, *c_index; if (mlx4_is_mfunc(dev)) { err = mlx4_cmd_imm(dev, 0, &out_param, ((u32) port) << 8 | (u32) RES_COUNTER, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) { *idx = get_param_l(&out_param); if (*idx == MLX4_SINK_COUNTER_INDEX) return -ENOSPC; mutex_lock(&priv->counters_table.mutex); c_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); mutex_unlock(&priv->counters_table.mutex); if (c_index->index == *idx) return -EEXIST; if (mlx4_is_slave(dev)) { new_counter_index = kmalloc(sizeof(struct counter_index), GFP_KERNEL); if (!new_counter_index) { mlx4_counter_free(dev, port, *idx); return -ENOMEM; } new_counter_index->index = *idx; mutex_lock(&priv->counters_table.mutex); list_add_tail(&new_counter_index->list, &priv->counters_table.global_port_list[port - 1]); mutex_unlock(&priv->counters_table.mutex); mlx4_dbg(dev, "%s: allocated counter index %d for port %d\n" , __func__, *idx, port); } } return err; } return __mlx4_counter_alloc(dev, 0, port, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_alloc); void __mlx4_counter_free(struct mlx4_dev *dev, int slave, int port, u32 idx) { /* check if native or slave and delete accordingly */ struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *pf, *tmp_pf; struct counter_index *vf, *tmp_vf; int first; if (idx == MLX4_SINK_COUNTER_INDEX) { mlx4_dbg(dev, "%s: try to delete default counter index %d for port %d\n" , __func__, idx, port); return; } if ((slave > MLX4_MAX_NUM_VF) || (slave < 0) || (port < 0) || (port > MLX4_MAX_PORTS)) { mlx4_warn(dev, "%s: deletion failed due to invalid slave(%d) or port(%d) index\n" , __func__, slave, port); return; } mutex_lock(&priv->counters_table.mutex); if (slave == 0) { first = 0; list_for_each_entry_safe(pf, tmp_pf, &priv->counters_table.global_port_list[port - 1], list) { /* the first 2 counters are reserved */ if (pf->index == idx) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, pf->index)) mlx4_dbg(dev,
"%s: reset counter %d failed\n", __func__, pf->index); if (1 < first && idx != MLX4_SINK_COUNTER_INDEX) { list_del(&pf->list); kfree(pf); mlx4_dbg(dev, "%s: delete counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); goto out; } else { mlx4_dbg(dev, "%s: can't delete default counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); goto out; } } first++; } mlx4_dbg(dev, "%s: can't delete counter index %d for native device (%d) port %d\n" , __func__, idx, slave, port); } else { first = 0; list_for_each_entry_safe(vf, tmp_vf, &priv->counters_table.vf_list[slave - 1][port - 1], list) { /* the first element is reserved */ if (vf->index == idx) { /* clear the counter statistic */ if (__mlx4_clear_if_stat(dev, vf->index)) mlx4_dbg(dev, "%s: reset counter %d failed\n", __func__, vf->index); if (first) { list_del(&vf->list); kfree(vf); mlx4_dbg(dev, "%s: delete counter index %d for slave %d port %d\n", __func__, idx, slave, port); mlx4_bitmap_free(&priv->counters_table.bitmap, idx, MLX4_USE_RR); goto out; } else { mlx4_dbg(dev, "%s: can't delete default slave (%d) counter index %d for port %d\n" , __func__, slave, idx, port); goto out; } } first++; } mlx4_dbg(dev, "%s: can't delete slave (%d) counter index %d for port %d\n" , __func__, slave, idx, port); } out: mutex_unlock(&priv->counters_table.mutex); } void mlx4_counter_free(struct mlx4_dev *dev, u8 port, u32 idx) { u64 in_param = 0; struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *counter, *tmp_counter; int first = 0; if (mlx4_is_mfunc(dev)) { set_param_l(&in_param, idx); mlx4_cmd(dev, in_param, ((u32) port) << 8 | (u32) RES_COUNTER, RES_OP_RESERVE, MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (mlx4_is_slave(dev) && idx != MLX4_SINK_COUNTER_INDEX) { mutex_lock(&priv->counters_table.mutex); list_for_each_entry_safe(counter, tmp_counter, &priv->counters_table.global_port_list[port - 1], list) { if (counter->index == idx && first++) { list_del(&counter->list); kfree(counter); mlx4_dbg(dev, "%s: delete counter index %d for port %d\n" , __func__, idx, port); mutex_unlock(&priv->counters_table.mutex); return; } } mutex_unlock(&priv->counters_table.mutex); } return; } __mlx4_counter_free(dev, 0, port, idx); } EXPORT_SYMBOL_GPL(mlx4_counter_free); int __mlx4_clear_if_stat(struct mlx4_dev *dev, u8 counter_index) { struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; int err = 0; u32 if_stat_in_mod = (counter_index & 0xff) | (1 << 31); if (counter_index == MLX4_SINK_COUNTER_INDEX) return -EINVAL; if (mlx4_is_slave(dev)) return 0; if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(if_stat_mailbox)) { err = PTR_ERR(if_stat_mailbox); return err; } err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev, if_stat_mailbox); return err; } u8 mlx4_get_default_counter_index(struct mlx4_dev *dev, int slave, int port) { struct mlx4_priv *priv = mlx4_priv(dev); struct counter_index *new_counter_index; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) { mlx4_dbg(dev, "%s: return counter index %d for slave %d port (MLX4_PORT_TYPE_IB) %d\n", __func__, MLX4_SINK_COUNTER_INDEX, slave, port); return (u8)MLX4_SINK_COUNTER_INDEX; } mutex_lock(&priv->counters_table.mutex); if (slave == 0) { new_counter_index = list_entry(priv->counters_table.global_port_list[port - 1].next, struct counter_index, list); } else { 
new_counter_index = list_entry(priv->counters_table.vf_list[slave - 1][port - 1].next, struct counter_index, list); } mutex_unlock(&priv->counters_table.mutex); mlx4_dbg(dev, "%s: return counter index %d for slave %d port %d\n", __func__, new_counter_index->index, slave, port); return (u8)new_counter_index->index; } int mlx4_get_vport_ethtool_stats(struct mlx4_dev *dev, int port, struct mlx4_en_vport_stats *vport_stats, int reset) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *if_stat_mailbox = NULL; union mlx4_counter *counter; int err = 0; u32 if_stat_in_mod; struct counter_index *vport, *tmp_vport; if (!vport_stats) return -EINVAL; if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(if_stat_mailbox)) { err = PTR_ERR(if_stat_mailbox); return err; } mutex_lock(&priv->counters_table.mutex); list_for_each_entry_safe(vport, tmp_vport, &priv->counters_table.global_port_list[port - 1], list) { if (vport->index == MLX4_SINK_COUNTER_INDEX) continue; memset(if_stat_mailbox->buf, 0, sizeof(union mlx4_counter)); if_stat_in_mod = (vport->index & 0xff) | ((reset & 1) << 31); err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) { mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n", __func__, vport->index); goto if_stat_out; } counter = (union mlx4_counter *)if_stat_mailbox->buf; if ((counter->control.cnt_mode & 0xf) == 1) { vport_stats->rx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastFrames); vport_stats->rx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxUnicastFrames); vport_stats->rx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfRxMulticastFrames); vport_stats->tx_broadcast_packets += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastFrames); vport_stats->tx_unicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxUnicastFrames); vport_stats->tx_multicast_packets += be64_to_cpu(counter->ext.counters[0].IfTxMulticastFrames); vport_stats->rx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxBroadcastOctets); vport_stats->rx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxUnicastOctets); vport_stats->rx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfRxMulticastOctets); vport_stats->tx_broadcast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxBroadcastOctets); vport_stats->tx_unicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxUnicastOctets); vport_stats->tx_multicast_bytes += be64_to_cpu(counter->ext.counters[0].IfTxMulticastOctets); vport_stats->rx_errors += be64_to_cpu(counter->ext.counters[0].IfRxErrorFrames); vport_stats->rx_dropped += be64_to_cpu(counter->ext.counters[0].IfRxNoBufferFrames); vport_stats->tx_errors += be64_to_cpu(counter->ext.counters[0].IfTxDroppedFrames); } } if_stat_out: mutex_unlock(&priv->counters_table.mutex); mlx4_free_cmd_mailbox(dev, if_stat_mailbox); return err; } EXPORT_SYMBOL_GPL(mlx4_get_vport_ethtool_stats); static int mlx4_setup_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int err; int port; __be32 ib_port_default_caps; err = mlx4_init_uar_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "user access region table (err=%d), aborting.\n", err); return err; } err = mlx4_uar_alloc(dev, &priv->driver_uar); if (err) { mlx4_err(dev, "Failed to allocate driver access region " "(err=%d), aborting.\n", err); goto err_uar_table_free; } priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if 
(!priv->kar) { mlx4_err(dev, "Couldn't map kernel access region, " "aborting.\n"); err = -ENOMEM; goto err_uar_free; } err = mlx4_init_pd_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "protection domain table (err=%d), aborting.\n", err); goto err_kar_unmap; } err = mlx4_init_xrcd_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "reliable connection domain table (err=%d), " "aborting.\n", err); goto err_pd_table_free; } err = mlx4_init_mr_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "memory region table (err=%d), aborting.\n", err); goto err_xrcd_table_free; } if (!mlx4_is_slave(dev)) { err = mlx4_init_mcg_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "multicast group table (err=%d), aborting.\n", err); goto err_mr_table_free; } } err = mlx4_init_eq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "event queue table (err=%d), aborting.\n", err); goto err_mcg_table_free; } err = mlx4_cmd_use_events(dev); if (err) { mlx4_err(dev, "Failed to switch to event-driven " "firmware commands (err=%d), aborting.\n", err); goto err_eq_table_free; } err = mlx4_NOP(dev); if (err) { if (dev->flags & MLX4_FLAG_MSI_X) { mlx4_warn(dev, "NOP command failed to generate MSI-X " "interrupt (IRQ %d).\n", priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_warn(dev, "Trying again without MSI-X.\n"); } else { mlx4_err(dev, "NOP command failed to generate interrupt " "(IRQ %d), aborting.\n", priv->eq_table.eq[dev->caps.num_comp_vectors].irq); mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); } goto err_cmd_poll; } mlx4_dbg(dev, "NOP command IRQ test passed\n"); err = mlx4_init_cq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "completion queue table (err=%d), aborting.\n", err); goto err_cmd_poll; } err = mlx4_init_srq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "shared receive queue table (err=%d), aborting.\n", err); goto err_cq_table_free; } err = mlx4_init_qp_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "queue pair table (err=%d), aborting.\n", err); goto err_srq_table_free; } err = mlx4_init_counters_table(dev); if (err && err != -ENOENT) { mlx4_err(dev, "Failed to initialize counters table (err=%d), " "aborting.\n", err); goto err_qp_table_free; } if (!mlx4_is_slave(dev)) { for (port = 1; port <= dev->caps.num_ports; port++) { ib_port_default_caps = 0; err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); if (err) mlx4_warn(dev, "failed to get port %d default " "ib capabilities (%d). Continuing " "with caps = 0\n", port, err); dev->caps.ib_port_def_cap[port] = ib_port_default_caps; /* initialize per-slave default ib port capabilities */ if (mlx4_is_master(dev)) { int i; for (i = 0; i < dev->num_slaves; i++) { if (i == mlx4_master_func_num(dev)) continue; priv->mfunc.master.slave_state[i].ib_cap_mask[port] = ib_port_default_caps; } } dev->caps.port_ib_mtu[port] = IB_MTU_4096; err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ?
dev->caps.pkey_table_len[port] : -1); if (err) { mlx4_err(dev, "Failed to set port %d (err=%d), " "aborting\n", port, err); goto err_counters_table_free; } } } return 0; err_counters_table_free: mlx4_cleanup_counters_table(dev); err_qp_table_free: mlx4_cleanup_qp_table(dev); err_srq_table_free: mlx4_cleanup_srq_table(dev); err_cq_table_free: mlx4_cleanup_cq_table(dev); err_cmd_poll: mlx4_cmd_use_polling(dev); err_eq_table_free: mlx4_cleanup_eq_table(dev); err_mcg_table_free: if (!mlx4_is_slave(dev)) mlx4_cleanup_mcg_table(dev); err_mr_table_free: mlx4_cleanup_mr_table(dev); err_xrcd_table_free: mlx4_cleanup_xrcd_table(dev); err_pd_table_free: mlx4_cleanup_pd_table(dev); err_kar_unmap: iounmap(priv->kar); err_uar_free: mlx4_uar_free(dev, &priv->driver_uar); err_uar_table_free: mlx4_cleanup_uar_table(dev); return err; } static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; int nreq = min_t(int, dev->caps.num_ports * min_t(int, num_possible_cpus() + 1, MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX); int err; int i; if (msi_x) { nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, nreq); if (msi_x > 1 && !mlx4_is_mfunc(dev)) nreq = min_t(int, nreq, msi_x); entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; for (i = 0; i < nreq; ++i) entries[i].entry = i; retry: err = pci_enable_msix(dev->pdev, entries, nreq); if (err) { /* Try again if at least 2 vectors are available */ if (err > 1) { mlx4_info(dev, "Requested %d vectors, " "but only %d MSI-X vectors available, " "trying again\n", nreq, err); nreq = err; goto retry; } kfree(entries); goto no_msi; } if (nreq < MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { /* Working in legacy mode, all EQs shared */ dev->caps.comp_pool = 0; dev->caps.num_comp_vectors = nreq - 1; } else { dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; } for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; dev->flags |= MLX4_FLAG_MSI_X; kfree(entries); return; } no_msi: dev->caps.num_comp_vectors = 1; dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; } static int mlx4_init_port_info(struct mlx4_dev *dev, int port) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; int err = 0; info->dev = dev; info->port = port; if (!mlx4_is_slave(dev)) { mlx4_init_mac_table(dev, &info->mac_table); mlx4_init_vlan_table(dev, &info->vlan_table); info->base_qpn = mlx4_get_base_qpn(dev, port); } sprintf(info->dev_name, "mlx4_port%d", port); info->port_attr.attr.name = info->dev_name; if (mlx4_is_mfunc(dev)) info->port_attr.attr.mode = S_IRUGO; else { info->port_attr.attr.mode = S_IRUGO | S_IWUSR; info->port_attr.store = set_port_type; } info->port_attr.show = show_port_type; sysfs_attr_init(&info->port_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_attr); if (err) { mlx4_err(dev, "Failed to create file for port %d\n", port); info->port = -1; } sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port); info->port_mtu_attr.attr.name = info->dev_mtu_name; if (mlx4_is_mfunc(dev)) info->port_mtu_attr.attr.mode = S_IRUGO; else { info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR; info->port_mtu_attr.store = set_port_ib_mtu; } info->port_mtu_attr.show = show_port_ib_mtu; sysfs_attr_init(&info->port_mtu_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); if (err) { mlx4_err(dev, "Failed to create mtu file for port %d\n", port);
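/* Roll back: remove the port type attribute created above and mark this port's info as invalid. */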
device_remove_file(&info->dev->pdev->dev, &info->port_attr); info->port = -1; } return err; } static void mlx4_cleanup_port_info(struct mlx4_port_info *info) { if (info->port < 0) return; device_remove_file(&info->dev->pdev->dev, &info->port_attr); device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); } static int mlx4_init_steering(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int num_entries = dev->caps.num_ports; int i, j; priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); if (!priv->steer) return -ENOMEM; for (i = 0; i < num_entries; i++) for (j = 0; j < MLX4_NUM_STEERS; j++) { INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); } return 0; } static void mlx4_clear_steering(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_steer_index *entry, *tmp_entry; struct mlx4_promisc_qp *pqp, *tmp_pqp; int num_entries = dev->caps.num_ports; int i, j; for (i = 0; i < num_entries; i++) { for (j = 0; j < MLX4_NUM_STEERS; j++) { list_for_each_entry_safe(pqp, tmp_pqp, &priv->steer[i].promisc_qps[j], list) { list_del(&pqp->list); kfree(pqp); } list_for_each_entry_safe(entry, tmp_entry, &priv->steer[i].steer_entries[j], list) { list_del(&entry->list); list_for_each_entry_safe(pqp, tmp_pqp, &entry->duplicates, list) { list_del(&pqp->list); kfree(pqp); } kfree(entry); } } } kfree(priv->steer); } static int extended_func_num(struct pci_dev *pdev) { return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); } #define MLX4_OWNER_BASE 0x8069c #define MLX4_OWNER_SIZE 4 static int mlx4_get_ownership(struct mlx4_dev *dev) { void __iomem *owner; u32 ret; if (pci_channel_offline(dev->pdev)) return -EIO; owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); return -ENOMEM; } ret = readl(owner); iounmap(owner); return (int) !!ret; } static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; if (pci_channel_offline(dev->pdev)) return; owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); return; } writel(0, owner); msleep(1000); iounmap(owner); } static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) { struct mlx4_priv *priv; struct mlx4_dev *dev; int err; int port; int nvfs, prb_vf; pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); err = pci_enable_device(pdev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, " "aborting.\n"); return err; } mlx4_get_val(num_vfs.dbdf2val.tbl, pci_physfn(pdev), 0, &nvfs); mlx4_get_val(probe_vf.dbdf2val.tbl, pci_physfn(pdev), 0, &prb_vf); if (nvfs > MLX4_MAX_NUM_VF) { dev_err(&pdev->dev, "There are more VF's (%d) than allowed(%d)\n", nvfs, MLX4_MAX_NUM_VF); return -EINVAL; } if (nvfs < 0) { dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); return -EINVAL; } /* * Check for BARs. */ if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing DCS, aborting." 
"(driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%x)\n", pci_dev_data, pci_resource_flags(pdev, 0)); err = -ENODEV; goto err_disable_pdev; } if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Missing UAR, aborting.\n"); err = -ENODEV; goto err_disable_pdev; } err = pci_request_regions(pdev, DRV_NAME); if (err) { dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); goto err_disable_pdev; } pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); goto err_release_regions; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) { dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " "consistent PCI DMA mask.\n"); err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); goto err_release_regions; } } /* Allow large DMA segments, up to the firmware limit of 1 GB */ dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; goto err_release_regions; } dev = &priv->dev; dev->pdev = pdev; INIT_LIST_HEAD(&priv->dev_list); INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); mutex_init(&priv->port_mutex); INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); INIT_LIST_HEAD(&priv->bf_list); mutex_init(&priv->bf_mutex); dev->rev_id = pdev->revision; dev->numa_node = dev_to_node(&pdev->dev); /* Detect if this device is a virtual function */ if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { /* When acting as pf, we normally skip vfs unless explicitly * requested to probe them. */ if (nvfs && extended_func_num(pdev) > prb_vf) { mlx4_warn(dev, "Skipping virtual function:%d\n", extended_func_num(pdev)); err = -ENODEV; goto err_free_dev; } mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); dev->flags |= MLX4_FLAG_SLAVE; } else { /* We reset the device and enable SRIOV only for physical * devices. Try to claim ownership on the device; * if already taken, skip -- do not allow multiple PFs */ err = mlx4_get_ownership(dev); if (err) { if (err < 0) goto err_free_dev; else { mlx4_warn(dev, "Multiple PFs not yet supported." " Skipping PF.\n"); err = -EINVAL; goto err_free_dev; } } if (nvfs) { mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", nvfs); err = pci_enable_sriov(pdev, nvfs); if (err) { mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n", err); err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev->num_vfs = nvfs; } } atomic_set(&priv->opreq_count, 0); INIT_WORK(&priv->opreq_task, mlx4_opreq_action); /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left * the HCA in an undefined state. */ err = mlx4_reset(dev); if (err) { mlx4_err(dev, "Failed to reset HCA, aborting.\n"); goto err_sriov; } } slave_start: err = mlx4_cmd_init(dev); if (err) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); goto err_sriov; } /* In slave functions, the communication channel must be initialized * before posting commands. 
Also, init num_slaves before calling * mlx4_init_hca */ if (mlx4_is_mfunc(dev)) { if (mlx4_is_master(dev)) dev->num_slaves = MLX4_MAX_NUM_SLAVES; else { dev->num_slaves = 0; err = mlx4_multi_func_init(dev); if (err) { mlx4_err(dev, "Failed to init slave mfunc" " interface, aborting.\n"); goto err_cmd; } } } err = mlx4_init_hca(dev); if (err) { if (err == -EACCES) { /* Not primary Physical function * Running in slave mode */ mlx4_cmd_cleanup(dev); dev->flags |= MLX4_FLAG_SLAVE; dev->flags &= ~MLX4_FLAG_MASTER; goto slave_start; } else goto err_mfunc; } /* In master functions, the communication channel must be initialized * after obtaining its address from fw */ if (mlx4_is_master(dev)) { err = mlx4_multi_func_init(dev); if (err) { mlx4_err(dev, "Failed to init master mfunc " "interface, aborting.\n"); goto err_close; } } err = mlx4_alloc_eq_table(dev); if (err) goto err_master_mfunc; priv->msix_ctl.pool_bm = 0; mutex_init(&priv->msix_ctl.pool_lock); mlx4_enable_msi_x(dev); if ((mlx4_is_mfunc(dev)) && !(dev->flags & MLX4_FLAG_MSI_X)) { err = -ENOSYS; mlx4_err(dev, "INTx is not supported in multi-function mode, " "aborting.\n"); goto err_free_eq; } if (!mlx4_is_slave(dev)) { err = mlx4_init_steering(dev); if (err) goto err_free_eq; } err = mlx4_setup_hca(dev); if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && !mlx4_is_mfunc(dev)) { dev->flags &= ~MLX4_FLAG_MSI_X; dev->caps.num_comp_vectors = 1; dev->caps.comp_pool = 0; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); } if (err) goto err_steer; mlx4_init_quotas(dev); for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); if (err) goto err_port; } err = mlx4_register_device(dev); if (err) goto err_port; mlx4_request_modules(dev); mlx4_sense_init(dev); mlx4_start_sense(dev); priv->pci_dev_data = pci_dev_data; pci_set_drvdata(pdev, dev); return 0; err_port: for (--port; port >= 1; --port) mlx4_cleanup_port_info(&priv->port[port]); mlx4_cleanup_counters_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); err_steer: if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); err_free_eq: mlx4_free_eq_table(dev); err_master_mfunc: if (mlx4_is_master(dev)) { mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY); mlx4_multi_func_cleanup(dev); } if (mlx4_is_slave(dev)) { kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); } err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); mlx4_close_hca(dev); err_mfunc: if (mlx4_is_slave(dev)) mlx4_multi_func_cleanup(dev); err_cmd: mlx4_cmd_cleanup(dev); err_sriov: if (dev->flags & MLX4_FLAG_SRIOV) pci_disable_sriov(pdev); if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); err_free_dev: kfree(priv); err_release_regions: pci_release_regions(pdev); err_disable_pdev: pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); return err; } static int __devinit mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { printk_once(KERN_INFO "%s", mlx4_version); return __mlx4_init_one(pdev, id->driver_data); } static void mlx4_remove_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); int p; if (dev) { /* in SRIOV it is not allowed to unload the PF's * driver while there are live VFs */ if
(mlx4_is_master(dev)) { if (mlx4_how_many_lives_vf(dev)) mlx4_err(dev, "Removing PF when there are assigned VF's !!!\n"); } mlx4_stop_sense(dev); mlx4_unregister_device(dev); for (p = 1; p <= dev->caps.num_ports; p++) { mlx4_cleanup_port_info(&priv->port[p]); mlx4_CLOSE_PORT(dev, p); } if (mlx4_is_master(dev)) mlx4_free_resource_tracker(dev, RES_TR_FREE_SLAVES_ONLY); mlx4_cleanup_counters_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); if (mlx4_is_master(dev)) mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY); iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); if (!mlx4_is_slave(dev)) mlx4_clear_steering(dev); mlx4_free_eq_table(dev); if (mlx4_is_master(dev)) mlx4_multi_func_cleanup(dev); mlx4_close_hca(dev); if (mlx4_is_slave(dev)) mlx4_multi_func_cleanup(dev); mlx4_cmd_cleanup(dev); if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); if (dev->flags & MLX4_FLAG_SRIOV) { mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); } if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); kfree(dev->caps.qp0_tunnel); kfree(dev->caps.qp0_proxy); kfree(dev->caps.qp1_tunnel); kfree(dev->caps.qp1_proxy); kfree(priv); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } } static int restore_current_port_types(struct mlx4_dev *dev, enum mlx4_port_type *types, enum mlx4_port_type *poss_types) { struct mlx4_priv *priv = mlx4_priv(dev); int err, i; mlx4_stop_sense(dev); mutex_lock(&priv->port_mutex); for (i = 0; i < dev->caps.num_ports; i++) dev->caps.possible_type[i + 1] = poss_types[i]; err = mlx4_change_port_types(dev, types); mlx4_start_sense(dev); mutex_unlock(&priv->port_mutex); return err; } int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); enum mlx4_port_type curr_type[MLX4_MAX_PORTS]; enum mlx4_port_type poss_type[MLX4_MAX_PORTS]; int pci_dev_data, err, i; pci_dev_data = priv->pci_dev_data; for (i = 0; i < dev->caps.num_ports; i++) { curr_type[i] = dev->caps.port_type[i + 1]; poss_type[i] = dev->caps.possible_type[i + 1]; } mlx4_remove_one(pdev); err = __mlx4_init_one(pdev, pci_dev_data); if (err) return err; dev = pci_get_drvdata(pdev); err = restore_current_port_types(dev, curr_type, poss_type); if (err) mlx4_err(dev, "mlx4_restart_one: could not restore original port types (%d)\n", err); return 0; } static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { /* MT25408 "Hermon" SDR */ { PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" DDR */ { PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" QDR */ { PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" DDR PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" QDR PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" EN 10GigE */ { PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ { PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ { PCI_VDEVICE(MELLANOX, 0x675a), 
MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26468 ConnectX EN 10GigE PCIe gen2*/ { PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ { PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT26478 ConnectX2 40GigE PCIe gen2 */ { PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT }, /* MT25400 Family [ConnectX-2 Virtual Function] */ { PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF }, /* MT27500 Family [ConnectX-3] */ { PCI_VDEVICE(MELLANOX, 0x1003), 0 }, /* MT27500 Family [ConnectX-3 Virtual Function] */ { PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF }, { PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */ { PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */ { PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */ { PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */ { PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */ { PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */ { PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */ { PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */ { PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */ { PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */ { PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */ { PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */ { 0, } }; MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { mlx4_remove_one(pdev); return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) { int ret = __mlx4_init_one(pdev, 0); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } static const struct pci_error_handlers mlx4_err_handler = { .error_detected = mlx4_pci_err_detected, .slot_reset = mlx4_pci_slot_reset, }; static int suspend(struct pci_dev *pdev, pm_message_t state) { mlx4_remove_one(pdev); return 0; } static int resume(struct pci_dev *pdev) { return __mlx4_init_one(pdev, 0); } static struct pci_driver mlx4_driver = { .name = DRV_NAME, .id_table = mlx4_pci_table, .probe = mlx4_init_one, .remove = __devexit_p(mlx4_remove_one), .suspend = suspend, .resume = resume, .err_handler = &mlx4_err_handler, }; static int __init mlx4_verify_params(void) { int status; status = update_defaults(&port_type_array); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&port_type_array.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } status = update_defaults(&num_vfs); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&num_vfs.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } status = update_defaults(&probe_vf); if (status == INVALID_STR) { if (mlx4_fill_dbdf2val_tbl(&probe_vf.dbdf2val)) return -1; } else if (status == INVALID_DATA) { return -1; } if (msi_x < 0) { pr_warn("mlx4_core: bad msi_x: %d\n", msi_x); return -1; } if ((log_num_mac < 0) || (log_num_mac > 7)) { pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac); return -1; } if (log_num_vlan != 0) pr_warning("mlx4_core: log_num_vlan - obsolete module param, using %d\n", MLX4_LOG_NUM_VLANS); if (mlx4_set_4k_mtu != -1) pr_warning("mlx4_core: set_4k_mtu - obsolete module param\n"); if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) { pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; } if (mlx4_log_num_mgm_entry_size != -1 && (mlx4_log_num_mgm_entry_size < 
MLX4_MIN_MGM_LOG_ENTRY_SIZE || mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) { pr_warning("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not " "in legal range (-1 or %d..%d)\n", mlx4_log_num_mgm_entry_size, MLX4_MIN_MGM_LOG_ENTRY_SIZE, MLX4_MAX_MGM_LOG_ENTRY_SIZE); return -1; } if (mod_param_profile.num_qp < 18 || mod_param_profile.num_qp > 23) { pr_warning("mlx4_core: bad log_num_qp: %d\n", mod_param_profile.num_qp); return -1; } if (mod_param_profile.num_srq < 10) { pr_warning("mlx4_core: too low log_num_srq: %d\n", mod_param_profile.num_srq); return -1; } if (mod_param_profile.num_cq < 10) { pr_warning("mlx4_core: too low log_num_cq: %d\n", mod_param_profile.num_cq); return -1; } if (mod_param_profile.num_mpt < 10) { pr_warning("mlx4_core: too low log_num_mpt: %d\n", mod_param_profile.num_mpt); return -1; } if (mod_param_profile.num_mtt_segs && mod_param_profile.num_mtt_segs < 15) { pr_warning("mlx4_core: too low log_num_mtt: %d\n", mod_param_profile.num_mtt_segs); return -1; } if (mod_param_profile.num_mtt_segs > MLX4_MAX_LOG_NUM_MTT) { pr_warning("mlx4_core: too high log_num_mtt: %d\n", mod_param_profile.num_mtt_segs); return -1; } return 0; } static int __init mlx4_init(void) { int ret; if (mlx4_verify_params()) return -EINVAL; mlx4_catas_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) return -ENOMEM; if (enable_sys_tune) sys_tune_init(); ret = pci_register_driver(&mlx4_driver); if (ret < 0) goto err; return 0; err: if (enable_sys_tune) sys_tune_fini(); destroy_workqueue(mlx4_wq); return ret; } static void __exit mlx4_cleanup(void) { if (enable_sys_tune) sys_tune_fini(); pci_unregister_driver(&mlx4_driver); destroy_workqueue(mlx4_wq); } module_init_order(mlx4_init, SI_ORDER_MIDDLE); module_exit(mlx4_cleanup); -#include static int mlx4_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4_mod = { .name = "mlx4", .evhand = mlx4_evhand, }; MODULE_VERSION(mlx4, 1); DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_OFED_PREINIT, SI_ORDER_ANY); MODULE_DEPEND(mlx4, linuxapi, 1, 1, 1); Index: head/sys/ofed/include/linux/module.h =================================================================== --- head/sys/ofed/include/linux/module.h (revision 277401) +++ head/sys/ofed/include/linux/module.h (revision 277402) @@ -1,99 +1,99 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUX_MODULE_H_ #define _LINUX_MODULE_H_ +#include +#include +#include + #include #include #include #include #include #define MODULE_AUTHOR(name) #define MODULE_DESCRIPTION(name) #define MODULE_LICENSE(name) - -#ifndef MODULE_VERSION -#define MODULE_VERSION(name) -#endif #define THIS_MODULE ((struct module *)0) #define EXPORT_SYMBOL(name) #define EXPORT_SYMBOL_GPL(name) /* OFED pre-module initialization */ #define SI_SUB_OFED_PREINIT (SI_SUB_ROOT_CONF - 2) /* OFED default module initialization */ #define SI_SUB_OFED_MODINIT (SI_SUB_ROOT_CONF - 1) #include static inline void _module_run(void *arg) { void (*fn)(void); #ifdef OFED_DEBUG_INIT char name[1024]; caddr_t pc; long offset; pc = (caddr_t)arg; if (linker_search_symbol_name(pc, name, sizeof(name), &offset) != 0) printf("Running ??? (%p)\n", pc); else printf("Running %s (%p)\n", name, pc); #endif fn = arg; DROP_GIANT(); fn(); PICKUP_GIANT(); } #define module_init(fn) \ SYSINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_FIRST, _module_run, (fn)) #define module_exit(fn) \ SYSUNINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_SECOND, _module_run, (fn)) /* * The following two macros are a workaround for not having a module * load and unload order resolver: */ #define module_init_order(fn, order) \ SYSINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn)) #define module_exit_order(fn, order) \ SYSUNINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn)) #define module_get(module) #define module_put(module) #define try_module_get(module) 1 #endif /* _LINUX_MODULE_H_ */
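/*
 * Usage sketch (hypothetical "foo" module, for illustration only): a
 * Linux-style driver built against this compat header hooks into FreeBSD's
 * SYSINIT/SYSUNINIT machinery through the macros above; note that
 * _module_run() invokes the handler as a void function, so the Linux-style
 * int return value of the init routine is ignored.
 *
 *	static int foo_init(void) { printf("foo loaded\n"); return (0); }
 *	static void foo_exit(void) { printf("foo unloaded\n"); }
 *	module_init(foo_init);	expands to SYSINIT(foo_init, SI_SUB_OFED_MODINIT, SI_ORDER_FIRST, _module_run, (foo_init))
 *	module_exit(foo_exit);	expands to SYSUNINIT(foo_exit, SI_SUB_OFED_MODINIT, SI_ORDER_SECOND, _module_run, (foo_exit))
 *
 * With the empty MODULE_VERSION stub removed from this header, a line such
 * as MODULE_VERSION(foo, 1) now resolves to the native FreeBSD macro, so the
 * kernel linker can track the module's version and honor MODULE_DEPEND
 * declarations like MODULE_DEPEND(foo, linuxapi, 1, 1, 1).
 */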