Index: contrib/ofed/include/infiniband/Makefile =================================================================== --- contrib/ofed/include/infiniband/Makefile +++ contrib/ofed/include/infiniband/Makefile @@ -8,6 +8,7 @@ VERBINCS= ${IBVERBS}/arch.h ${IBVERBS}/driver.h ${IBVERBS}/kern-abi.h VERBINCS+= ${IBVERBS}/marshall.h ${IBVERBS}/opcode.h VERBINCS+= ${IBVERBS}/sa-kern-abi.h ${IBVERBS}/sa.h ${IBVERBS}/verbs.h +VERBINCS+= ${IBVERBS}/verbs_exp.h ${IBVERBS}/ofa_verbs.h VERBINCSDIR= ${INCLUDEDIR}/infiniband IBCOMMON= ${.CURDIR}/../../management/libibcommon/include/infiniband Index: contrib/ofed/libibverbs/AUTHORS =================================================================== --- contrib/ofed/libibverbs/AUTHORS +++ contrib/ofed/libibverbs/AUTHORS @@ -1,4 +1,4 @@ Roland Dreier -Dotan Barak +Dotan Barak Sean Hefty Michael S. Tsirkin Index: contrib/ofed/libibverbs/ChangeLog =================================================================== --- contrib/ofed/libibverbs/ChangeLog +++ contrib/ofed/libibverbs/ChangeLog @@ -1,583 +1,3108 @@ -2006-10-30 Jack Morgenstein - - * src/cmd.c (ibv_cmd_query_qp): Unmarshall sq_draining instead of - en_sqd_async_notify. +Vladimir Sokolovsky 2013-05-01 + + Roll 1.1.6mlnx2 release + + + +Yishai Hadas 2013-05-26 + + libibverbs: XRC sample, fix message size + + Fix size to prevent memory overflow + + +Igor Ivanov 2013-04-30 + + tests: Add unit-tests for Cross-channel API + + Added unit-tests based on gtest test environment to test verbs api. + Tests related Cross-channel functionality are included. + + +Yishai Hadas 2013-05-20 + + examples: Add examples for Cross-channel API + + Added examples using Cross-channel verbs API and adapted to use new + verbs extension API. + ibv_cc_task_pp is based on TASKs usage + ibv_cc_pingpong demonstrates wait opcode usage + + Added manuals into man folder for these two examples. 
+ + +Igor Ivanov 2013-04-30 + + man: Add manuals for new verbs api + + Added manuals for: + inserted ibv_create_qp_ex() into ibv_create_qp man; + inserted ibv_query_device_ex() into ibv_query_device man; + created new manuals for following functions as ibv_modify_cq, ibv_post_task + + +Yishai Hadas 2013-05-19 + + libibverbs: Add Cross-channel capability + + Added ibv_post_task. + + This code enables the hardware feature referred to as Cross-channel. The + functionality supports the posting of a task list to a Host Channel Adapter (HCA), + with a single completion queue entry indicating when the task list has completed. + The task list may have entries posted to multiple Queue Pairs (QPs) and includes + send and receive communication primitives, as well as the communication + coordination primitives wait, send_enable, and receive_enable. A single post + of tasks to QPs initiates the communication pattern, and all progress is handled + by the HCA. + + ibv_post_task - post a list of send/recv tasks (TASKs) to QPs + + It is added using verbs extensions approach. + + +Igor Ivanov 2013-03-14 + + libibverbs: Added new flag IB_DEVICE_CROSS_CHANNEL into enum ib_device_cap_flags + + This flag demonstrates presence of Cross-channel capability in a device. + + The device can be programmed to execute IO operation flows that involve + synchronization of IO operations on different IO channels as well as conditional + execution based on arithmetic operations on memory-resident variables. This + capability enables to program complex IO operation flows with a single function + call, thereby significantly reducing overhead associated with IO processing. + + +Yishai Hadas 2013-05-19 + + libibverbs: Add ibv_query_device_ex + + It is added using verbs extensions approach and extend abilities of existing + function such as ibv_query_device. This code adds new struct verbs_device_attr + that extends existing struct ibv_device_attr with new field as + struct ibv_device_calc_cap calc_cap. 
+ This code does not add new uverbs call as ibv_cmd_query_device_ex to process + IB_USER_VERBS_CMD_QUERY_DEVICE_EX command. + + +Yishai Hadas 2013-05-19 + + libibverbs: Add ibv_modify_cq + + This code adds new uverbs call as ibv_cmd_modify_cq to process + IB_USER_VERBS_CMD_MODIFY_CQ_EX command. It is added using verbs extensions + approach. + + +Yishai Hadas 2013-05-19 + + libibverbs: Support Cross-channel capability in ibv_create_qp_ex + + This code adds new uverbs call as ibv_cmd_create_qp_ex to process + IB_USER_VERBS_CMD_CREATE_QP_EX command. It is added using verbs extensions + approach and extend abilities of existing functions such as ibv_create_qp. + + + +Yishai Hadas 2013-05-19 + + libibverbs: XRC open flags + + Add missing mode in open system call + + +Yishai Hadas 2013-05-19 + + libibverbs: Add XRC sample source file + + +Yishai Hadas 2013-05-12 + + libibverbs: Add XRC sample application + + +Sean Hefty 2012-09-19 + + libibverbs: Add man page for ibv_open_qp + + +Yishai Hadas 2013-05-12 + + libibverbs: Add ibv_open_qp + + XRC receive QPs are shareable across multiple processes. Allow + any process with access to the xrc domain to open an existing + QP. After opening the QP, the process will receive events + related to the QP and be able to modify the QP. + + +Yishai Hadas 2013-05-12 + + libibverbs: Add support for XRC QPs + + XRC queue pairs: xrc defines two new types of QPs. The + initiator, or send-side, xrc qp behaves similar to a send- + only RC qp. xrc send qp's are managed through the existing + QP functions. The send_wr structure is extended in a back- + wards compatible way to support posting sends on a send xrc + qp, which require specifying the remote xrc srq. + + The target, or receive-side, xrc qp behaves differently + than other implemented qp's. A recv xrc qp can be created, + modified, and destroyed like other qp's through the existing + calls. The qp_init_attr structure is extended for xrc qp's. 
+ + Because xrc recv qp's are bound to an xrcd, rather than a pd, + it is intended to be used among multiple processes. Any process + with access to an xrcd may allocate and connect an xrc recv qp. + The actual xrc recv qp is allocated and managed by the kernel. + If the owning process explicitly destroys the xrc recv qp, it is + destroyed. However, if the xrc recv qp is left open when the + user process exits or closes its device, then the lifetime of + the xrc recv qp is bound with the lifetime of the xrcd. + + +Yishai Hadas 2013-05-12 + + libibverbs: Add support for XRC SRQs + + XRC support requires the use of a new type of SRQ. + XRC shared receive queues: xrc srq's are similar to normal + srq's, except that they are bound to an xrcd, rather + than to a protection domain. Based on the current spec + and implementation, they are only usable with xrc qps. To + support xrc srq's, we define a new srq_init_attr structure + to include an srq type and other needed information. + + The kernel ABI is also updated to allow creating extended + SRQs. + + +Yishai Hadas 2013-05-07 + + libibverbs: Introduce XRC domains + + XRC introduces several new concepts and structures, one of + which is the XRC domain. + + XRC domains: xrcd's are a type of protection domain used to + associate shared receive queues with xrc queue pairs. Since + xrcd are meant to be shared among multiple processes, we + introduce new APIs to open/close xrcd's. + + The user to kernel ABI is extended to account for opening/ + closing the xrcd. + + +Eli Cohen 2013-04-09 + + Remove old APIs definitions + + This patch adds changes from Sean's libibverbs patch 1 v5 that were not taken + already to ofed's libibverbs. + + +Hadar Hen Zion 2013-02-11 + + Implement ibv_create_flow and ibv_destroy_flow + + According to the new flow steering verbs API and using the extended verbs + mechanism. + ibv_create_flow verb allows user space applications to attach flow + specifications for a QP. 
ibv_flow_attr is a pointer for flow specifications + structs that contain mandatory control parameters and optional L2, L3 and L4 + headers. + ibv_flow_attr struct is a mandatory control struct: + + struct ibv_flow_attr { +uint32_t comp_mask; +enum ibv_flow_attr_type type; +uint16_t size; +uint16_t priority; +uint8_t num_of_specs; +uint8_t port; +uint32_t flags; +/* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx + * struct ibv_flow_spec_yyy + */ + }; + + And can be followed by the optional flow headers structs: + struct ibv_flow_spec_ib + struct ibv_flow_spec_eth + struct ibv_flow_spec_ipv4 + struct ibv_flow_spec_tcp_udp + + ib_flow_attr pointer includes ibv_flow_attr and headers structs, which can be + detected by reading the size and num_of_specs fields in ib_flow_attr struct. + + The user can choose flow type according to the following enum: + enum ibv_flow_attr_type { +/* steering according to rule specifications */ +IBV_FLOW_ATTR_NORMAL = 0x0, +/* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ +IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, +/* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ +IBV_FLOW_ATTR_MC_DEFAULT = 0x2, +/* sniffer rule - receive all port traffic */ +IBV_FLOW_ATTR_MIRROR = 0x3, + }; + + When setting flow type to NORMAL, the incoming traffic will be steered + according to the rule specifications. ALL_DEFAULT and MC_DEFAULT rules options + are valid only for Ethernet link type since IB link type packets always include + QP number. + + To detach the flow from the QP, ibv_destroy_flow is called. + ibv_destroy_flow requires struct ibv_flow which contains a uobject handle. 
+ + +Hadar Hen Zion 2013-03-05 + + Add general definitions to support verbs extensions + + To support verbs extension mechanism between user space libraries and + uverbs kernel module, each new added verb which is based on verbs extension + approach should include the following: + 1. Enum index greater than IB_USER_VERBS_CMD_THRESHOLD. + 2. Additional common fields in the kernel-user struct: +__u16 provider_in_words; +__u16 provider_out_words; +__u32 cmd_hdr_reserved; +__u32 comp_mask; + + The above fields will be initialized by + IBV_INIT_CMD_EX/IBV_INIT_CMD_RESP_EX new macros. + + Added uverbs_cmd_type to recognize if command should be processed as a legacy or as an extended. + Added new header support macros. + + +Hadar Hen Zion 2013-02-11 + + Revert "Implement ibv_attach_flow and ibv_detach_flow." + + This reverts commit fbcdd10fdb19c2957208772a555f937f423a7bb6. + + +Hadar Hen Zion 2013-02-11 + + Revert "Pad ibv_attach_flow struct to avoid alignment problems" + + This reverts commit a93bb4980321b53921e08266247f9287540df0c2. + + +Hadar Hen Zion 2013-02-11 + + Revert "Expose rule_type field in ibv_flow_spec struct" + + This reverts commit d15371a7500944bb0e2e700c1748a31172623e24. + + +Yishai Hadas 2013-02-03 + + shared_mr: example program adapted to use new verbs extension API + + +Yishai Hadas 2013-01-31 + + shared_mr support on top of verbs extension + + Adds shared_mr API based on new verbs extension mechanism. + This patch also includes a compatible API to previous one used in 1.8.X. + + +Marcel Apfelbaum 2012-12-25 + + libibverbs: Infrastructure to support verbs extensions + + Based on a patch from Yishai Hadas + + Infrastructure to support extended verbs capabilities in a forward/backward + manner. + + Support for extensions is determined by the provider calling + verbs_register_driver in place of ibv_register_driver. When + extensions are enabled, ibverbs sets the current alloc_context / + free_context device operations to NULL. 
These are used to + indicate that the struct ibv_device may be cast to struct + verbs_device. + + With extensions, ibverbs allocates the ibv_context structure + and calls into the provider to initialize it. The init call + is part of the verbs_device struct. + + +Marcel Apfelbaum 2012-12-25 + + Revert "verbs extension mechanism based on Sean first patch" + + This reverts commit 5fc421bc824d4d105e641d81ea1d7602b5ce07b5. + + To be replaced by new verbs extensions mechanism submitted to community. + + +Vladimir Sokolovsky 2013-01-07 + + Roll 1.1.6mlnx1 release + + +Hadar Hen Zion 2012-11-05 + + Expose rule_type field in ibv_flow_spec struct + + Needed for flow steering ibdump implementation + + Issue:98828 - * include/infiniband/kern-abi.h: Change en_sqd_async_notify member - of struct ibv_query_qp_resp to sq_draining. +Dotan Barak 2012-10-23 -2006-10-30 Roland Dreier + shared_mr: Fix memory leak in case of an error + + Squash with 'shared mr sample program' + + Issue 94266. + + Reviewed-by: Yishai Hadas - * src/init.c (find_drivers): Make find_drivers() take a const - directory name, and tweak how we strip trailing /s so that we - don't have to modify the directory name passed in. Constify - default_path too. +Yishai Hadas 2012-09-30 -2006-10-25 Roland Dreier + ibv_shared_mr - fix to work over 2GB + + move to use size_t instead of int + - * src/init.c (init_drivers): Remove assignment to dev->driver now - that it is gone for good. +Yishai Hadas 2012-08-28 - * include/infiniband/verbs.h: Remove .driver member of struct - ibv_device, since it is never really used. + man page for ibv_shared_mr + -2006-10-17 Roland Dreier +Dotan Barak 2012-07-25 - * include/infiniband/arch.h: Update i386 and x86_64 memory barrier - macros to be more than compiler barriers, to guard against - out-of-order speculative reads. 
+ man: remove the limitation of creating QP with SRQ only for UD and RC QPs + - * include/infiniband/arch.h: Add rmb() and wmb() macros in - addition to the full mb(), so that low-level drivers can ask for - weaker ordering if that's all that is needed. - -2006-10-03 Roland Dreier +Dotan Barak 2012-07-25 - * src/cmd.c (ibv_cmd_get_context_v2, ibv_cmd_get_context) - (ibv_cmd_query_device, ibv_cmd_query_port, ibv_cmd_alloc_pd) - (ibv_cmd_reg_mr, ibv_cmd_create_cq_v2, ibv_cmd_create_cq) - (ibv_cmd_poll_cq, ibv_cmd_resize_cq, ibv_cmd_destroy_cq) - (ibv_cmd_create_srq, ibv_cmd_create_qp, ibv_cmd_post_send) - (ibv_cmd_post_recv, ibv_cmd_post_srq_recv, ibv_cmd_create_ah) - (ibv_cmd_destroy_qp): Annotate so that Valgrind knows responses - are defined after write() succeeds. The kernel writes into the - response structure directly, so without these, Valgrind thinks - that response structures are undefined memory. This is based on - patches and suggestions by Rainer Keller , Jeff - Squyres and Siqing Fan. + man: added IBV_QPT_RAW_PACKET usage + - * src/ibverbs.h: Add wrapper for VALGRIND_MAKE_MEM_DEFINED so that - it can be used in .c files without worrying about whether Valgrind - is installed or enabled. +Dotan Barak 2012-07-25 - * configure.in: Add support for Valgrind annotation (enabled with - --with-valgrind option to configure). + Revert "man: added IBV_QPT_RAW_PACKET usage" + + Squash this patch with "man: added IBV_QPT_RAW_PACKET usage" + + This reverts commit 5222e9091cf96ba7e096b5ed78a00c879563ce78. + - * src/cmd.c (ibv_cmd_query_port, ibv_cmd_create_cq, - ibv_cmd_modify_qp): Set reserved fields to 0 to avoid future - problems and also to make Valgrind a little quieter. +Merge: fe21442 477be24 +Vladimir Sokolovsky 2012-06-27 - * src/init.c (init_drivers): Set node_type and transport_type - values of device being created. 
+ Merge branch 'mlnx_ofed_1_8' into mlnx_ofed_2_0 - * include/infiniband/verbs.h: Add ibv_node_type enum value - IBV_NODE_RNIC, and add enum ibv_transport_type. Add node_type and - transport_type fields to struct ibv_device. +Dotan Barak 2012-06-27 -2006-09-12 Roland Dreier + IB/libibverbs: Expose the "request for checksum" in send flags + + Reviewed-by: Moni Shoua - * include/infiniband/verbs.h: Swap wr_id and next members of - struct ibv_send_wr and struct ibv_recv_wr. This allows wr_id to - be naturally aligned without padding on 32-bit platforms. +Dotan Barak 2012-06-27 -2006-08-23 Roland Dreier + Keep IBV_QPT_RAW_ETH for backward compatibility + + libnes and others still use IBV_QPT_RAW_ETH + - * include/infiniband/driver.h: Add a definition of the macro - IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS so that low-level driver plugins - can detect the changed signature of ibv_cmd_resize_cq(). +Dotan Barak 2012-06-27 -2006-08-23 Ralph Campbell + man: added IBV_QPT_RAW_PACKET usage + - * src/cmd.c (ibv_cmd_resize_cq): Add resp and resp_size parameters - so that the low-level driver in the kernel can return - device-specific information from the resize CQ operation. +Dotan Barak 2012-06-27 -2006-07-26 Roland Dreier + libibverbs: add raw ethernet QP type IBV_QPT_RAW_PACKET=7 + + The patch enables usage of L2 raw ethernet QP type for user-space + applications. + - * src/verbs.c (ibv_reg_mr, ibv_dereg_mr): Add calls to - ibv_dontfork_range() and ibv_dofork_range() for memory regions - registered by library consumers. +Yishai Hadas 2012-06-26 - * include/infiniband/verbs.h: Add declaration of ibv_fork_init(). + reg_shared_mr: adding "no rdma" mode + + Reviewed-by: Shachar Raindel - * include/infiniband/driver.h: Add declarations of - ibv_dontfork_range() and ibv_dofork_range(). +Nir Muchtar 2011-11-03 - * src/memory.c: Rewrite to use a red-black tree instead of a - linked list. 
Change from doing mlock()/munlock() to - madvise(..., MADV_DONTFORK) and madvise(..., MADV_DOFORK), and - change the name of the entry points to ibv_dontfork_range() and - ibv_dofork_range(). Add ibv_fork_init() for applications to - request fork-safe behavior. + Pad ibv_attach_flow struct to avoid alignment problems + - * src/ibverbs.h: Kill off unused declarations. +Nir Muchtar 2011-10-02 - * src/init.c (ibverbs_init): Get rid of call to ibv_init_mem_map(). + Implement ibv_attach_flow and ibv_detach_flow. + + ibv_attach_flow are new verbs that allow to attach + a QP to the specified flow spec. + Flow specifications are described using ibv_flow_spec + structs. + - * include/infiniband/verbs.h: Add addr and length field to struct - ibv_mr so that memory regions can be madvised(). This changes the - ABI, since the layout of struct ibv_mr is changed. +Dotan Barak 2012-05-08 -2006-07-04 Roland Dreier + libibverbs: clarifying the environment variable values in ibv_fork_init.3 + + We got a feedback that this sentence is vague and it isn't clear enough what is + the expected value in the environment variables. + - * include/infiniband/arch.h: Fix typo in sparc mb() - implementation: the asm should just be empty -- the "sync" - instruction was mistakenly cut and pasted from the ppc version. +Yishai Hadas 2012-05-07 -2006-06-07 Sean Hefty + shared mr sample program + rc_pingpong improvements for contiguous mode + - * src/verbs.c include/infiniband/verbs.h: Add new routines: - ibv_init_ah_from_wc() and ibv_create_ah_from_wc() to simplify UD QP - communication. +Yishai Hadas 2012-05-07 - * src/marshall.c include/infiniband/marshall.h: Expose - ibv_copy_ah_attr_from_kern to retrieve ibv_ah_attr from kernel for - a UD QP. + man page update for contiguous mode and shared_mr + -2006-06-01 Roland Dreier +Yishai Hadas 2012-05-07 - * src/device.c (ibv_get_device_list): Actually return a - NULL-terminated array as the documentation promises. 
+ shared mr - improve fork handling + -2006-05-31 Roland Dreier +Dotan Barak 2012-04-29 - * src/init.c (find_drivers): Fix memory leak: the result of - asprintf() needs to be freed when we're done with it. + fix resource leaks in pingpong examples in case of a failure + - * examples/asyncwatch.c (event_name_str): Print human-readable - form of IBV_EVENT_CLIENT_REREGISTER. +Dotan Barak 2012-04-25 -2006-05-31 Leonid Arsh + Added the man page verbs.7 + + Added the man page verbs.7 which is an introduction to libibverbs man + pages. + + Reviewed-by: Bart Van Assche - * include/infiniband/verbs.h: Add IBV_EVENT_CLIENT_REREGISTER. +Dotan Barak 2012-04-25 -2006-05-22 Roland Dreier + Add new InfiniBand link speeds + + Introduce support for the following extended speeds: + + FDR: IBA extended speed 14.0625 Gbps. + EDR: IBA extended speed 25.78125 Gbps. + + Reviewed-by: Hal Rosenstock - * examples/devinfo.c (print_hca_cap): Read board_id attribute from - sysfs using ibv_read_sysfs_file() instead of libsysfs. +Dotan Barak 2012-04-09 - * src/cmd.c, src/marshall.c, src/sysfs.c: Include , - since it is no longer implicitly included via . + man: fix typo in ibv_get_device_guid.3 + - * include/infiniband/driver.h, include/infiniband/verbs.h, - src/device.c, src/init.c, src/verbs.c: Remove dependency on - libsysfs by implementing what is required directly on top of - filesystem operations. +Dotan Barak 2012-04-24 - * include/infiniband/driver.h, src/init.c: Change name of driver - entry point to ibv_driver_init(), and update prototype to remove - libsysfs dependency. + Rename the attribute private -> private_data + + Since "private" is a reserved word in C++, it shouldn't be used as a name of a + structure attribute. + + Reviewed-by: Jack Morgenstein - * src/marshall.c, include/infiniband/marshall.h, - include/infiniband/sa.h: Remove deprecated ib_xxx symbols. 
+Yishai Hadas 2012-04-17 - * Makefile.am: Bump SONAME to 2, since libibverbs 1.1 will be - ABI-incompatible with libibverbs 1.0. + expose register shared mr capabilities. + + new access flag for reg_mr indicating that mr should be created shared with a given permissions. + - user,group,other read/write. + new verb named ibv_reg_shared_mr to be used for registering a new mr based on shared mr_id. + + +Yishai Hadas 2012-04-17 + + verbs extension mechanism based on Sean first patch + + Register driver extension API to indicate that driver supports the new extension mechanism. + To be used by low level drivers (e.g. libmlx4) + Enable checking whether given driver/device instance supports the extension mechanism. + Mechanism to get an extension operations based on a given extension name. + Based on Sean patch from http://marc.info/?l=linux-rdma&m=130714769030036&w=2 + + +Yishai Hadas 2012-04-15 + + libibverbs: rc_pingpong example using contiguous mr + + Add an option via command line indicating to use contiguous mode + Make relevant changes for that mode + + +Yishai Hadas 2012-04-15 + + libibverbs: Expose memory region allocator mode to applications + + Exposing a new capability bit indicating that system supported the new allocation mode. + Recognition of new mode in libibverbs and acting as below: + NULL expected as input address. + + The allocated address is hooked on the mr->address for further use by the application. + Flow change for dofork/dontfork. + + +Yishai Hadas 2012-04-15 + + libibverbs: Adding allocator access flag for memory region registration + + Extend ibv_access_flags with an extra bit named IBV_ACCESS_ALLOCATE_MR + to ask for new allocator mode. + + +Jack Morgenstein 2012-04-15 + + For RoCE, the default packet size in ud_pingpong should not exceed the I/F MTU + + Without this change, ibv_ud_pingpong under RoCE hangs unless the -s parameter + is explicitly provided in the command line (to be 1430 or less). 
+ + +Roland Dreier 2011-12-21 + + Roll libibverbs 1.1.6 release + + +Dotan Barak 2011-11-14 + + Fix memory leaks in various error flows + + +Marcel Apfelbaum 2011-10-03 + + Add support to ibv_devinfo for displaying extended speeds + + Add code to ibv_devinfo to display the following new speeds: + + 8: FDR-10 is a proprietary link speed which is 10.3125 Gbps with 64b/66b + encoding rather than 8b/10b encoding. + 16: FDR - 14.0625 Gbps + 32: EDR - 25.78125 Gbps + + Reviewed-by: Hal Rosenstock + +Roland Dreier 2011-11-11 + + Debian: Don't use brace expansion for {a,so} in libibverbs-dev.install + + +Bart Van Assche 2011-08-07 + + Makefile.am: Fix an automake warning + + Fix the following automake warning message: + + Makefile.am:1: `INCLUDES' is the old name for `AM_CPPFLAGS' (or `*_CPPFLAGS') + + A quote from the automake manual: + + INCLUDES + This does the same job as AM_CPPFLAGS (or any per-target _CPPFLAGS variable + if it is used). It is an older name for the same functionality. This + variable is deprecated; we suggest using AM_CPPFLAGS and per-target + _CPPFLAGS instead. + + +Bart Van Assche 2011-08-07 + + Add "foreign" option to AM_INIT_AUTOMAKE + + Switch to the modern form of the AM_INIT_AUTOMAKE macro and tell + automake that the libibverbs package does not follow the GNU + standards. This change makes it possible to use 'autoreconf' for the + libibverbs package. + + +Or Gerlitz 2011-07-19 + + Update examples for IBoE + + Since IBoE requires usage of GRH, update ibv_*_pingpong examples to + accept GIDs. GIDs are given as an index to the local port's table and + are exchanged between the client and the server through the socket + connection. + + +Or Gerlitz 2011-07-20 + + Add GID change event + + Add handling for GID change events, which are generated by the kernel + IBoE stack when the HW driver updates the GID table. 
+ + +Or Gerlitz 2011-07-19 + + Update kernel API header to include link_layer + + Modify the code to handle returning the link layer of a port from the + kernel to the library. The kernel has done this since commit + 2420b60b1dc4 ("IB/uverbs: Return link layer type to userspace for + query port operation"), merged in 2.6.37-rc1. + + The new field does not change the size of struct ibv_query_port_resp + as it replaces a reserved field. Binary compatibility between the + kernel to the library is kept, since old kernels running below new + library will not zero that field, so it will be read as "unspecified," + while an old library running over new kernel will ignore the value + returned by the kernel. + + The solution was suggested by Roland Dreier + and Jason Gunthorpe + + +Or Gerlitz 2011-07-20 + + Add link_layer field port attribute + + The new field has three possible values: IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET. It can be used by + applications to know the link layer used by the port, which can be + either InfiniBand or Ethernet. + + The addition of the new field does not change the size of struct + ibv_port_attr due to alignment of the preceding fields. Binary + compatibility between the library to applications is kept, since old + apps running over new library do not read this field, and new apps + running over old library will determine the link layer as unspecified + and hence take their IB code path. 
+ + The solution was suggested by Roland Dreier + and Jason Gunthorpe + + +Roland Dreier 2011-06-29 + + Update Debian Standards-Version to 3.9.2 + + +Roland Dreier 2011-06-29 + + Don't ship .la files in Debian libibverbs-dev + + +Roland Dreier 2011-06-29 + + Package description shouldn't start with an article + + +Julien BLACHE 2011-06-29 + + debian/libibverbs1.postinst: Let debhelper code run in all cases + + Do not exit postinst if not configuring -- code added by debhelper needs + to run in all cases, not only the configure case. + + +Roland Dreier 2011-06-28 + + Roll libibverbs 1.1.5 release + + +Alexander Schmidt 2010-08-20 + + Handle huge pages in ibv_fork_init() and madvise tracking + + When fork support is enabled in libibverbs, madvise() is called for + every memory page that is registered as a memory region. Memory + ranges that are passed to madvise() must be page aligned and the size + must be a multiple of the page size. + + libibverbs uses sysconf(_SC_PAGESIZE) to find out the system page size + and rounds all ranges passed to reg_mr() according to this page size. + When memory from libhugetlbfs is passed to reg_mr(), this does not + work as the page size for this memory range might be different + (e.g. 16MB). So libibverbs would have to use the huge page size to + calculate a page aligned range for madvise. + + As huge pages are provided to the application "under the hood" when + preloading libhugetlbfs, the application does not have any knowledge + about when it registers a huge page or a usual page. + + To work around this issue, detect the use of huge pages in libibverbs + and align memory ranges passed to madvise according to the huge page + size. Determining the page size of a given memory range by watching + madvise() fail has proven to be unreliable. So we introduce the + RDMAV_HUGEPAGES_SAFE environment variable to let the user decide if + the page size should be checked on every reg_mr() call or not. 
This + requires the user to be aware if huge pages are used by the running + application or not. + + I did not add an additional API call to enable this, as applications + can use setenv() + ibv_fork_init() to enable checking for huge pages + in the code. + + + [ Updated ibv_fork_init() manpage for RDMAV_HUGEPAGES_SAFE. - Roland ] + + +Roland Dreier 2011-06-27 + + Set DM-Upload-Allowed now that Roland is a Debian Maintainer + + +Dotan Barak 2011-06-15 + + Fix date format in RPM spec file changelog + + +Yann Droneaud 2011-05-31 + + read_config_file: ignore driver line without driver name + + If there's no driver name, strsep() will set config to NULL and later + processing of the driver name will segfault. + + Spotted with zzuf. + + +Roland Dreier 2011-05-27 + + Fix crash if no devices and ibv_get_device_list() is called multiple times + + If no devices are found, ibverbs_init() sets num_devices to 0. This + means the next call to __ibv_get_device_list() would call + ibverbs_init() again, which crashes because ibverbs_init() leaves + various internal pointers pointing to freed memory. + + Fix this by using pthread_once() to call ibverbs_init() exactly once, + and then doing the right thing even if num_devices stays 0. + + Tested-by: Yann Droneaud + +Tom Tucker 2010-07-29 + + Add AC_PROG_LIBTOOL to fix libtool configure warning + + Add AC_PROG_LIBTOOL to configure.in to fix an autogen.sh warning about + LIBTOOL configuration. + + +Jason Gunthorpe 2010-10-07 + + Fix autotools to include the necessary m4 files + + Running autogen.sh with a new version of autotools and then building + on a system with an older version tends to explode. Unfortunately + this is sometimes necessary since the new version is required by the + package. The fix changes the autogen.sh output from: + + + aclocal -I config + + libtoolize --force --copy + libtoolize: putting auxiliary files in AC_CONFIG_AUX_DIR, `config'. 
+ libtoolize: copying file `config/ltmain.sh' + libtoolize: Consider adding `AC_CONFIG_MACRO_DIR([m4])' to configure.in and + libtoolize: rerunning libtoolize, to keep the correct libtool macros in-tree. + libtoolize: Consider adding `-I m4' to ACLOCAL_AMFLAGS in Makefile.am. + + autoheader + + automake --foreign --add-missing --copy + + autoconf + + to: + + + aclocal -I config + + libtoolize --force --copy + libtoolize: putting auxiliary files in AC_CONFIG_AUX_DIR, `config'. + libtoolize: copying file `config/ltmain.sh' + libtoolize: putting macros in AC_CONFIG_MACRO_DIR, `config'. + libtoolize: copying file `config/libtool.m4' + libtoolize: copying file `config/ltoptions.m4' + libtoolize: copying file `config/ltsugar.m4' + libtoolize: copying file `config/ltversion.m4' + libtoolize: copying file `config/lt~obsolete.m4' + + autoheader + + automake --foreign --add-missing --copy + + autoconf + + And fixes various build problems in weird cases. + + This is how GNU envisions this mess works at least... + + +Roland Dreier 2010-06-03 + + Roll libibverbs 1.1.4 release + + +Roland Dreier 2010-06-03 + + Small configure.in modernizations + + +Roland Dreier 2010-06-03 + + Debian: Switch to dpkg-source 3.0 format + + +Hakon Bugge 2010-06-02 + + Force line-buffering in ibv_asyncwatch + + ibv_asyncwatch defaults to block-buffering when stdout is redirected to + a file or pipe. Changing to line-buffered mode makes it more usable in + scripted environments. + + +Sean Hefty 2010-05-06 + + Add path record definitions to sa.h + + Add definitions for path record wire definition. This will be used by + the librdmacm and ib_acm service, and is exchanged with the kernel + using the newer set and query route functionality. + + +Alex Vainman 2010-03-28 + + Undo changes in memory range tree when madvise() fails + + ibv_madvise_range() doesn't cleanup if madvise() fails. 
This patch + rolls back changes already made in the memory range tracking tree by + madvise() calls before the one that failed. We can do this fairly + simply by simply restarting ibv_madvise_range() from the original + start to the current location with the opposite advice/inc values. + + +Alex Vainman 2010-02-01 + + Fix incorrect splits/merges in the memory tree when madvise() fails. + + ibv_madvise_range() first manages (splits or merges) memory ranges in + the tree and only then calls madvise(). If madvise() fails, the + tree's memory range may contain incorrectly split or merged ranges. + The patch undoes the split and merge operations performed on the node + which caused the madvise() failure as well as on that node's + neighbors. + + +Alex Vainman 2010-02-01 + + Increment node refcount in ibv_madvise_range() only if madvise() succeeds + + ibv_madvise_range() first updates the memory range reference count and + then calls to madvise(). If madvise() fails, the reference count of + the failed node is incorrect. Fix this by updating the node's + reference count only after a successful call to madvise() (or if no + call to madvise() was needed). + + +Alex Vainman 2010-02-01 + + Factor out range handling in ibv_madvise_range() + + Clean up some code in ibv_madvise_range() by adding functions + merge_ranges(), split_range() and get_start_node(). + + +Roland Dreier 2009-11-11 + + Update Debian build rules so -dbg package isn't empty + + Add an override_dh_strip target so that the -dbg package ends up with + actual debug information in it. This was broken in the dh7 transition. + + +Roland Dreier 2009-10-30 + + Use proper build dependency version in debian control file + + The debian rules use a override_dh_makeshlibs target, so (as lintian + points out) we need a build dependency on debhelper >= 7.0.50. + + +Roland Dreier 2009-10-29 - * Create libibverbs 1.1 branch and bump version number to 1.1-pre1. + Roll libibverbs 1.1.3 release + -2006-05-22 Michael S. 
Tsirkin +Roland Dreier 2009-10-29 - * include/infiniband/verbs.h: Remove trailing commas from - enumerators to quiet warnings from obsolete compilers. + Merge Fedora spec file changes + -2006-05-02 Roland Dreier +Roland Dreier 2009-10-29 - * Release version 1.0.3. + Rewrite GID output in ibv_devinfo to avoid type punning warnings + + Avoid casting from uint8_t* to uint16_t* and then dereferencing to avoid + warnings about type punning. + + +Jason Gunthorpe 2009-10-28 + + Return errors from ibv_get_device_list() via errno + + Get rid of the output to stderr on various failure cases from + ibv_get_device_list() such as no device driver found, so that + applications can control how to present errors. Fix up the examples + and the man page to match. + + Code expecting this behavior linking to old libibverbs will + get the old fprint and errno set to garbage (probably ESPIPE). + + +Jason Gunthorpe 2009-10-29 + + Fix double free in find_sysfs_devs() + + Fix double free of sysfs_dev in find_sysfs_devs if ibv_read_sysfs_file() + fails (which is unlikely in practice). + + Jason Gunthorpe -2006-05-01 Roland Dreier +Jeff Squyres 2009-10-28 - * include/infiniband/arch.h: Only SPARC V9 ISA supports membar. - So just use generic memory barrier for older sparc archs. + Show transport (IB vs. iWARP) type in ibv_devinfo + + +Robert Pearson 2009-09-14 + + Fix fall-through bug in options case in pingpong examples + + Add missing breaks for the 'm' case of options handling. + -2006-04-11 Roland Dreier +Jason Gunthorpe 2009-07-30 - * src/sysfs.c (ibv_read_sysfs_file): Fix memory leak if open fails. + Do not use enum types for bit flags + + Arithmetic operations on enum members do not result in the enum type; + C++ is stricter about this than C. So using flag enums results in + compile errors when they are OR'd together in a C++ application. + + To fix this, replace all flag enum objects with int. 
int was selected + to preserve the ABI; we checked that enum types are the same size as + int on at least i386, x86-64, ppc32, ppc64, ia64, and mips, and arm + and sparc also appear compatible with this choice. + - * src/device.c (ibv_get_device_guid), src/verbs.c (ibv_query_gid, - ibv_query_pkey), src/init.c (init_drivers, check_abi_version): Use - libibverbs functions instead of libsysfs functions to get to sysfs. +Roland Dreier 2009-09-02 - * src/sysfs.c (ibv_get_sysfs_path, ibv_read_sysfs_file): Add some - simple functions for accessing sysfs without using libsysfs. + Update Debian Standards-Version to 3.8.3 + - * include/infiniband/sa-kern-abi.h: Deprecate struct - ib_kern_path_rec name; struct ibv_kern_path_rec is now preferred. +Roland Dreier 2009-09-02 - * include/infiniband/sa.h: Deprecate struct ib_sa_XXX names; - struct ibv_sa_XXX is now preferred. + Change Debian build system from cdbs to debhelper 7 + + With debhelper 7 we can get just as simple a rules file without all of + the cdbs magic. + + +Roland Dreier 2009-08-24 - * src/marshall.c, include/infiniband/marshall.h: Deprecate - ib_copy_XXX() names; ibv_copy_XXX() is preferred. Add stub - wrappers with the old names so old binaries still work. + Use AC_GNU_SOURCE in configure.in instead of -D_GNU_SOURCE in Makefile + -2006-04-11 Hoang-Nam Nguyen +Roland Dreier 2009-08-24 - * src/verbs.c (ibv_rate_to_mult, mult_to_ibv_rate): Add new - functions to convert between IB rate enums and multiples of the - base 2.5 Gb/sec rate. + Enable quiet build rules with automake 1.11+ + + Conditionally use the new AM_SILENT_RULES macro in configure.in. + -2006-04-11 Roland Dreier +Roland Dreier 2009-08-24 - * include/infiniband/verbs.h: Add __attribute_const macro to - portably mark functions as __attribute__((const)) + Revert "Update build system to use shave" + + This reverts commit 25ade84d1cd0b8b3a68872d3fc195e88cc7c4211. Rather + than using shave, we'll use automake 1.11's native quiet build. 
+ -2006-03-28 Roland Dreier +Jason Gunthorpe 2009-07-18 - * src/init.c (load_driver): Print warning if dlopen() of a driver - plugin fails. + Make the gid argument to ibv_attach_mcast and ibv_detach_mcast const + + ibv_attach_mcast() and ibv_detach_mcast() don't change the gid + argument, so the arguments should be const to allow applications to + pass in constant gids. This constness flows through to the driver + call struct and into the drivers and back into + ibv_cmd_attach_mcast()/ibv_cmd_detach_mcast(). + -2006-03-22 Dotan Barak +Jason Gunthorpe 2009-07-14 - * examples/asyncwatch.c: Print asynchronous event name as well as - raw integer value. - -2006-03-22 Roland Dreier + Allow config file paths to the driver library to be absolute + + If the driver line starts with a / then no lib prefix is applied and + the full path is passed to dlopen(). This allows a completely + self-contained installation that relies on RPATH for the binaries and + this mechanism for the drivers. + - * include/infiniband/verbs.h (ibv_req_notify_cq): Document - parameters better. +Merge: 25ade84 11f8931 +Roland Dreier 2009-06-25 -2006-03-16 Roland Dreier + Merge branch 'stable' - * src/cmd.c, src/device.c, src/memory.c, src/verbs.c: Add include - of to get a declaration of free() and avoid compile - warnings. +Roland Dreier 2009-06-25 -2006-03-14 Roland Dreier + Update Debian policy version to 3.8.2 + + None of the changes 3.7.3 -> 3.8.2 affect us. + - * Release version 1.0.2. +Roland Dreier 2009-06-24 - * Makefile.am (EXTRA_DIST): Remove debian/ directory from - tarballs, since Debian policy is that upstream tarballs should not - include it. + Move -dbg Debian package to section debug + -2006-03-13 Roland Dreier +Roland Dreier 2009-06-24 - * Release version 1.0.1. + Update build system to use shave + + Add shave (git://git.lespiau.name/shave) to make build output of libibverbs + much more readable by abbreviating the outputted commands so that + warnings become visible, etc.
+ - * src/init.c (check_abi_version), src/verbs.c (ibv_query_gid, - ibv_query_pkey): Use sysfs_open_attribute() and - sysfs_read_attribute() instead of the deprecated function - sysfs_read_attribute_value(), which is no longer present in - libsysfs2 (which is already in Debian and Ubuntu). +Dotan Barak 2008-10-18 - * Release version 1.0. + Update Dotan's email + + Update Dotan's email in all of the files it appears. + -2006-03-06 Roland Dreier +Shirley Ma 2008-07-22 - * include/infiniband/verbs.h: Add enum ibv_rate to define encoding - of static_rate field (based on a patch from Jack Morgenstein - ). + Implement PPC wmb() with sync instead of eieio + + wmb() for PPC was incorrectly defined as an eieio instruction in + libibverbs. eieio only orders pure I/O memory or pure system memory + accesses. In a situation where the device drivers use the d_map + kernel services to share a portion of system memory with an I/O + adapter, we need to use sync() instead. See the link below for reference: + + http://www.ibm.com/developerworks/eserver/articles/powerpc.html + -2006-03-06 Ralph Campbell +Roland Dreier 2008-06-24 - * src/init.c (find_drivers): Fix minor memory leak: call - globfree() to free memory allocated by glob(). + Revert conversion of ibv_devinfo to use ibv_port_state_str() + + Using ibv_port_state_str() changes the port state output of ibv_devinfo + (eg "PORT_DOWN" becomes "down"), which is reported to break scripts that + parse this output. Revert to using the old code in ibv_devinfo; we want + ibv_port_state_str() to continue producing the nicer-looking lower case + output, so just leave the open-coded alternative in ibv_devinfo. + + Reported-by: Jack Morgenstein + -2006-02-23 Dotan Barak +Dotan Barak 2008-05-23 - * src/cmd.c (ibv_cmd_create_srq): Add support for kernel ABI - version 6 (take SRQ capacity from kernel response to create SRQ). + Code formatting cleanups + + Improve readability based on warnings from kernel's checkpatch.pl.
+ -2006-02-16 Roland Dreier +Roland Dreier 2008-04-21 - * Release version 1.0-rc7. + Update Debian packaging to include new manpage symlinks + - * src/cmd.c (ibv_cmd_create_qp): Add support for kernel ABI - version 5 (properly aligned struct ibv_create_qp_resp). +Roland Dreier 2008-04-18 -2006-02-15 Roland Dreier + Include NMU changes for version 1.1.1-1.1 in Debian changelog + - * src/cmd.c (ibv_cmd_create_qp): Allow userspace device-specific - driver to pass in a response buffer, so that the low-level driver - in the kernel can pass back device-specific information. This - changes the userspace driver API, since the signature of - ibv_cmd_create_qp() is changed. +Roland Dreier 2008-04-18 -2006-02-14 Roland Dreier + Change .nl macro in man pages to .sp + + There actually is no ".nl" macro defined in troff, so convert all uses + of it to ".sp", which seems to be what was intended. + - * Release version 1.0-rc6. +Roland Dreier 2008-04-18 -2006-02-13 Dotan Barak + Roll libibverbs 1.1.2 release + - * examples/devinfo.c (print_hca_cap): Print board_id from sysfs, - if present. +Roland Dreier 2008-04-18 + + Add RPM dependency on base package to -devel package + + This fixes the rpmlint warning + + libibverbs-devel.x86_64: W: no-dependency-on libibverbs + + +Roland Dreier 2008-04-18 + + Correct typo ibv_mult_to_rate -> mult_to_ibv_rate in man page + + +Ira Weiny 2008-04-15 + + Add functions to convert enum values to strings + + Add ibv_xxx_str() functions to convert node type, port state, event + type and wc status enum values to strings. + + +Roland Dreier 2008-04-18 + + Update various text to talk about general RDMA, not just InfiniBand + + libibverbs works with both iWARP and InfiniBand devices, so update + various places that talk about InfiniBand to be more general. 
+ + +Roland Dreier 2008-03-30 + + Fix download directory in RPM spec file + + libibverbs sources are now in downloads/verbs/, not just downloads/ + + +Dotan Barak 2007-10-23 + + Add command line parameter to set SL for pingpong examples + + Add a --sl/-l command line parameter for the pingpong examples to set + the SL of the QP/AH. This can be used to test a QoS setup. + + +Roland Dreier 2008-03-12 + + Add debian/watch file + + +Troy Benjegerdes 2008-03-09 + + Fix valgrind false positive in ibv_create_comp_channel() + + Need to mark response buffer as defined after write() succeeds. + + +Or Gerlitz 2007-07-25 + + Document IBV_SEND_INLINE buffer ownership + + If the IBV_SEND_INLINE flag is set in a work request posted with + ibv_post_send(), the data buffers can be reused immediately after the + call returns. Document this. + + +Dotan Barak 2008-02-03 + + Fixes for man pages + + Some fixes and updates to several man pages: + * Correct formatting in a few places. + * Add more "SEE ALSO" functions where appropriate. + * Document byte order of GUID and P_Key fields. + * Fix example code in ibv_get_cq_event.3 + * Document GRH handling on receive. 
+ + +Dotan Barak 2007-10-10 + + Fix some issues in the examples + + Fix the following issues reported by valgrind in the examples: + * memory leaks + * uninitialized members of attribute structures + + +Dotan Barak 2007-10-10 + + Fix several valgrind false positives + + Fix several issues that were reported by valgrind: + + * Initialize reserved attributes of command structures + + * Fix the pointer and size when calling VALGRIND_MAKE_MEM_DEFINED in + ibv_cmd_reg_mr() and ibv_cmd_create_cq_v2(): if we have struct + xxx_resp *resp and resp_size, we need to do + +VALGRIND_MAKE_MEM_DEFINED(resp, resp_size) + + rather than getting the parameters wrong as in + +VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp) +VALGRIND_MAKE_MEM_DEFINED(resp, sizeof resp_size); + + * Call VALGRIND_MAKE_MEM_DEFINED for buffers that are filled by + the kernel in ibv_cmd_query_srq(), ibv_cmd_destroy_srq() and + ibv_cmd_query_qp(). + + +Roland Dreier 2008-02-12 + + Put correct version information in Debian shlibs + + Use DEB_DH_MAKESHLIBS_ARGS_ALL to pass appropriate -V option to + dh_makeshlibs, since new symbols were added in version 1.1.0. + + +Roland Dreier 2008-01-23 + + Convert hyphen to minus sign in ibv_query_pkey man page + + A bare "-" in a man page will be rendered as a hyphen; to get a minus + sign, "\-" must be used. Very pedantic people (or automatic checkers, + such as Debian's lintian tool) may notice the difference. The man page + for ibv_query_pkey incorrectly wrote a negative return value as "-1". + Fix this to be the correct "\-1". + + +Roland Dreier 2008-01-21 + + Use real Homepage: tag instead of pseudo-header inside description + + New dpkg can actually parse Homepage: fields in debian/control. + + +Roland Dreier 2007-12-03 + + Update Debian policy version to 3.7.3 + + None of the changes 3.7.2 -> 3.7.3 affect us.
+ + +Roland Dreier 2007-11-20 + + Always return valid bad_wr on error from ibv_post_{send,recv,srq_recv} + + There are error cases in the kernel's uverbs work request posting + functions where the return value is negative (i.e., an error) and yet a + non-zero resp.bad_wr is not written back to userspace. In this case, + ibv_cmd_post_send() should still set the bad_wr pointer. + + Bug pointed out in ibv_post_send() by Ralph Campbell + , and noticed elsewhere by Dotan Barak + . + + +Roland Dreier 2007-10-26 + + Fix spec file License: tag + + Our license information is properly described as "GPLv2 or BSD". + + +swelch@systemfabricworks.com 2007-08-31 + + Set ibv_device->node_type when allocating device + + When allocating a device structure, set the node_type member correctly. + + +Dotan Barak 2007-08-08 + + Initialize reserved attributes in modify QP command + + Initialize the reserved attributes in modify QP command to eliminate + valgrind warnings like: + + ==23549== Syscall param write(buf) points to uninitialised byte(s) + ==23549== at 0x316B1B933F: (within /lib64/tls/libc-2.3.4.so) + ==23549== by 0x4A33AF7: ibv_cmd_modify_qp (cmd.c:782) + ==23549== by 0x4F860D8: mlx4_modify_qp (verbs.c:480) + ==23549== by 0x4A37A53: ibv_modify_qp@@IBVERBS_1.1 (verbs.c:441) + ==23549== by 0x40972E: qp_reset_to_rtr (mr_test_fun.c:1189) + ==23549== by 0x403AFC: mr_test_connect_qp (mr_test.c:232) + ==23549== by 0x404956: do_test (mr_test.c:85) + ==23549== by 0x402DF8: main (main.c:448) + ==23549== Address 0x7FEFFF2AE is on thread 1's stack + + +Roland Dreier 2007-07-10 + + Fix too-big madvise() call in ibv_madvise_range() + + When the first memory range found in ibv_madvise_range() is merged + with the previous range before entering the loop that calls madvise(), + a too-big range could be passed to madvise(). This could lead to + trying to madvise() memory that has already been freed and unmapped, + which causes madvise() and therefore ibv_reg_mr() to fail. 
+ + Fix this by making sure we don't madvise() any memory outside the + range passed into ibv_madvise_range(). + + This fixes . + + +Roland Dreier 2007-07-03 + + Fix Valgrind annotations so they can actually be built + + The AC_CHECK_HEADER() test for will never result + in HAVE_VALGRIND_MEMCHECK_H being defined, so ibverbs.h will never + include and Valgrind annotations will never actually + get built. Fix this by adding an AC_DEFINE() of HAVE_VALGRIND_MEMCHECK_H + if the header is found. + + Pointed out by Jeff Squyres . + + +Roland Dreier 2007-07-03 + + Clean up NVALGRIND comment in config.h.in + + Update configure.in so that the comment generated by autoheader for + NVALGRIND in config.h.in is a complete sentence to match the style of + the rest of the file. + + +Roland Dreier 2007-06-26 + + Add to + + uses uint64_t, so it needs to include . + + +Roland Dreier 2007-06-21 + + Remove deprecated ${Source-Version} from debian/control + + Replace ${Source-Version} with the more-correct ${binary:Version}. + + +Roland Dreier 2007-06-15 + + Roll libibverbs 1.1.1 release + + +Merge: 1b12912 796e9fa +Roland Dreier 2007-06-03 + + Merge branch 'master' into stable + +Jack Morgenstein 2007-06-03 + + Initialize QP state to RESET + + For newly created QPs, set qp->state to IBV_QPS_RESET. At least + libmlx4 needs this fix, or else it won't correctly initialize the QP's + send queue when transitioning to INIT. + + +Merge: 3008108 2ba2697 +Roland Dreier 2007-05-31 + + Merge branch 'master' into stable + +Michael S. Tsirkin 2007-05-30 + + Don't warn root if RLIMIT_MEMLOCK is low + + The amount of memory root can lock isn't limited, so the rlimit value + doesn't matter in this case. Do not print a warning about + RLIMIT_MEMLOCK being too low if EUID is 0. + + +Dotan Barak 2007-05-06 + + ibv_devinfo: Decode max_vl_num to actual number + + Print max_vl_num as the actual number of VLs in addition to the + encoded value. 
+ + +Roland Dreier 2007-05-17 + + Add wc_wmb() + + Add a write-combining flush operation, wc_wmb(), which can be used by + low-level drivers to force ordering of write-combined stores. + + +Roland Dreier 2007-05-03 + + Fix call to ibv_free_device_list() in pingpong examples + + When a -d option to specify which device to use is passed to the + pingpong examples, they iterate through the device list by + incrementing the dev_list pointer. This means that the call to + ibv_free_device_list() may not get the right pointer. + + Fix this by using an index to iterate through the array and leaving + the dev_list pointer itself alone. + -2006-02-13 Roland Dreier +Roland Dreier 2007-05-03 - * examples/asyncwatch.c, examples/device_list.c, - examples/devinfo.c: Remove cpu_to_be64()/be64_to_cpu() and use - htonll()/ntohll() from . + Trivial whitespace fixes in examples/ + -2006-02-13 Dotan Barak +Roland Dreier 2007-04-28 - * src/cmd.c (ibv_cmd_query_qp, ibv_cmd_query_srq), - include/infiniband/driver.h: Add driver interface for calling - query QP and query SRQ kernel commands. + Update Debian build + + Use DEB_AUTO_UPDATE_LIBTOOL rather than manual rerunning autotools to + avoid setting RPATH. Remove DEB_DH_STRIP_ARGS since cdbs should + handle this automatically at debhelper compat level 5. Let cdbs + generate build-deps automatically (move control to control.in). + - * include/infiniband/kern-abi.h: Add kernel ABI for query QP and - query SRQ. +Merge: cbde955 f6f8070 +Roland Dreier 2007-04-12 - * src/verbs.c (ibv_query_qp, ibv_query_srq), - include/infiniband/verbs.h: Add query QP and query SRQ library - APIs. This changes the provider ABI, since new fields are added - to struct ibv_context_ops; source compatibility with provider - libraries is preserved, but binaries will have to be recompiled. - Neither source nor binary compatibility with consumers of - libibverbs is affected. 
+ Merge branches 'stable' and 'stable-1.0' into stable -2006-02-01 Roland Dreier +Roland Dreier 2007-04-11 - * examples/rc_pingpong.c, examples/uc_pingpong.c, - examples/ud_pingpong.c, examples/srq_pingpong.c: Fix bug in - searching for device by name when there's more than one device. + Roll libibverbs 1.1 release + -2006-01-31 Roland Dreier +Roland Dreier 2007-04-11 - * include/infiniband/verbs.h, include/infiniband/driver.h: Remove - useless "extern" from function declarations. + Change a few references from OpenIB to OpenFabrics + -2006-01-26 Roland Dreier +Roland Dreier 2007-04-06 - * include/infiniband/driver.h, src/cmd.c (ibv_cmd_resize_cq): Add - driver interface for calling resize CQ kernel command. + Fix ibv_srq_pingpong option handling + + Add missing break statement in command line option handling switch. + Without the break statement, setting the 'm' (mtu) command line option + fell through to the next case and the same value was used for the 'q' + (num-qp) option. + + (cherry picked from commit 09335ff6377acf90d5ed1c28a6ce69d2aa620e09) - * include/infiniband/kern-abi.h: Add resize CQ kernel ABI. +Roland Dreier 2007-04-06 - * include/infiniband/verbs.h, src/verbs.c (ibv_resize_cq): Add - resize CQ library API. This changes the provider ABI, since a new - field is added to struct ibv_context_ops; source compatibility - with provider libraries is preserved, but binaries will have to be - recompiled. Neither source nor binary compatibility with - consumers of libibverbs is affected. + Fix ibv_srq_pingpong option handling + + Add missing break statement in command line option handling switch. + Without the break statement, setting the 'm' (mtu) command line option + fell through to the next case and the same value was used for the 'q' + (num-qp) option. 
+ + +Roland Dreier 2007-04-06 + + Add missing newline to rlimit(MEMLOCK) warning + + +Roland Dreier 2007-04-04 + + Roll libibverbs 1.1-rc2 release + + +Roland Dreier 2007-04-04 + + Clean up spec file + + Modernize spec file to better match Fedora guidelines: + - remove unused %ver macro + - fix Release to avoid '-' character + - switch to newer recommendation for BuildRoot + - add Requires(post): /sbin/ldconfig and Requires(postun): /sbin/ldconfig + - split static libraries into new -devel-static package + - don't use %makeinstall + + Based on a patch from Doug Ledford . + + +Roland Dreier 2007-03-29 + + Print warning if memlock limit is low + + Check RLIMIT_MEMLOCK, and if it is 32 KB or less, print a warning. + This should help with support requests for systems that set this limit + too low. + + +Roland Dreier 2007-03-29 + + Update README now that 1.1 ABI is (semi-)frozen + + +Roland Dreier 2007-03-28 + + Bump version number to 1.1-rc2-pre1 + + +Roland Dreier 2007-03-27 + + Roll libibverbs 1.1-rc1 release + + +Dotan Barak 2007-03-27 + + Man page updates + + - Fix spelling mistakes + - Convert "PKey"/"QKey" to "P_Key"/"Q_Key" + - Fix variable names in ibv_get_cq_event() example + - Add non-blocking examples for ibv_get_cq_event() and ibv_get_async_event() + + +Roland Dreier 2007-03-27 + + Reference count completion channels + + Keep a reference count in completion channel structure, so that + ibv_destroy_comp_channel() can return EBUSY if a consumer tries to + destroy a channel that still has CQs attached. + + Suggested by Dotan Barak . + + +Dotan Barak 2007-03-19 + + Fix memory leak on ibv_fork_init() error path + + Free test buffer so we don't leak memory when madvise() fails. 
+ + +Roland Dreier 2007-03-08 + + Add low-level driver hooks for reregister MR and memory windows + + Add methods to struct ibv_context_ops to make it possible to implement + reregister memory region and alloc/bind/dealloc memory window + operations in the future without breaking the libibverbs ABI. + + Assuming these methods and data structures are designed properly + (which is hard to say, absent any real implementation) then it will + only be necessary to add new libibverbs functions to call the hooks, + which will be source and binary compatible with existing applications + and low-level drivers. Applications that want to use the new + functions can easily check for their existence at compile time. + + +Jack Morgenstein 2007-02-22 + + Delete man3 symbolic links before creating them during install + + The following patch removes manpage symbolic links so that they may be + relinked in the install. Otherwise a second install will fail, since + the links already exist. + + Suggested by Michael Tsirkin. + + +Roland Dreier 2007-02-20 + + Add remaining libibverbs manpages + + Add the rest of the manpages for libibverbs functions in section 3. + These manpages were written by Dotan Barak . + + +Roland Dreier 2007-02-19 + + Start adding libibverbs manpages + + Add the first few manpages for libibverbs functions in section 3. + Also, add them into the build and add rules to the Makefile to link + shared manpages together (since eg ibv_get_device_list and + ibv_free_device_list have the same manpage). Update Debian and Fedora + packaging to include section 3 manpages in the development package. + + These manpages were written by Dotan Barak . + + +Roland Dreier 2007-01-31 + + Update Debian changelog + + Remove item about bumping soname, since it's no longer true. Merge in + 1.0.4-1 info from stable branch, since it's been uploaded to Debian + archive already. 
+ + +Roland Dreier 2007-01-31 + + Revert "The ibv_cmd_* create functions need to set context" + + This reverts commit 5eaee85aec025be16e5aeeaac64abc6d8d9c191c. + + This breaks ibv_destroy_ah() with libmthca, and isn't really appropriate + for the stable branch. + + +Roland Dreier 2007-01-31 + + Fix unset context breakage when a low-level driver does kernel bypass + + Commit 8b3d2254 ("The ibv_cmd_* create functions need to set context") + breaks things when a low-level driver does not actually use an + ibv_cmd_* function to create an object, since then the context member + of that object never gets set. For example, libmthca does not need to + call into the kernel to create an AH, and hence ibv_destroy_ah() will + crash because it tries to call a function pointer from the AH's + context member, which never gets set. + + Fix this by adding back all the setting of context to the main verbs + functions like ibv_create_ah() (but still leave the setting in the + ibv_cmd_* create functions too). This means context gets set twice, + but that doesn't really hurt anything. Once we branch off libibverbs + 1.1 as stable, we can change the signatures of the ibv_cmd_* destroy + functions to take an explicit context parameter, and get rid of + setting context in the ibv_cmd_* create functions. + + +Roland Dreier 2007-01-29 + + Rename Debian package back to libibverbs1 + + Since commit fd448acc ("Add ABI compatibility for apps linked against + libibverbs 1.0") makes libibverbs 1.1 binary compatible with + applications linked against libibverbs 1.0 and the soname of the + library remains at 1, change the Debian package back to libibverbs1 + instead of bumping it to libibverbs2. + + +Roland Dreier 2007-01-29 + + Add ABI compatibility for apps linked against libibverbs 1.0 + + Add a compatibility layer that allows applications (but not low-level + drivers) linked against libibverbs 1.0 to work with libibverbs 1.1.
+ This is done by using Linux's versioned symbol linking support: the + native libibverbs entry points are given IBVERBS_1.1 versions, and + compatibility wrappers for entry points from libibverbs 1.0 are + created with an IBVERBS_1.0 version (to match what libibverbs 1.0 + exported). + + In essence, these wrappers create compatible proxies for every + structure returned to the application (struct ibv_device, ibv_context, + ibv_pd, etc), and map between the proxy and the real object when the + application calls into libibverbs. This code is mostly + straightforward, with a few complications in handling async events, + because the pointers in event structures must be translated back to + proxy structures when they are returned to the application. + + There are a few further wrinkles because the calls to data path + functions (poll CQ, post send, etc) are actually inline functions that + call directly into the context ops, so the context ops proxy structure + must actually contain pointers to compatibility wrappers for these + functions as well. This may have some performance impact but it seems + the overhead is unavoidable, and tests with NetPIPE on top of Open MPI + show that the latency and throughput differences seem to be lost in + the noise anyway. + + +Roland Dreier 2007-01-29 + + The ibv_cmd_* create functions need to set context + + If the ibv_cmd_* create function succeeds, then the object context + pointer must be set by that function so that the corresponding destroy + function will work. This avoids problems in the error cleanup path of + a low-level driver's create function that fails after calling an + ibv_cmd_* create function. + + (cherry picked from commit 8b3d225476c99ea29a68109a7d40e5ef353d4388) + +Steve Wise 2007-01-19 + + The ibv_cmd_* create functions need to set context + + If the ibv_cmd_* create function succeeds, then the object context + pointer must be set by that function so that the corresponding destroy + function will work.
This avoids problems in the error cleanup path of + a low-level driver's create function that fails after calling an + ibv_cmd_* create function. + + +Roland Dreier 2007-01-26 + + Fix caching of --version-script check + + The shell code in AC_CACHE_CHECK() should set the variable that we say + is being cached. So set ac_cv_version_script when testing whether ld + accepts --version-script, and then set LIBIBVERBS_VERSION_SCRIPT based + on the (possibly cached) value of ac_cv_version_script outside of the + AC_CACHE_CHECK(). + + +Dotan Barak 2007-01-17 + + Add resource cleanup at end of pingpong tests + + Clean up all IB resources at the end of pingpong examples. Ack CQ + events when using events to all CQ to be destroyed. + + +Roland Dreier 2007-01-18 + + Fix checks of asprintf() return value + + asprintf() returns the number of bytes printed, so the way to check + for failure is to test if the return value is < 0. + + (cherry picked from commit 2d83a4b522b00ab36ada613920e9cf5594648934) + +Michael S. Tsirkin 2007-01-18 + + Fix checks of asprintf() return value + + asprintf() returns the number of bytes printed, so the way to check + for failure is to test if the return value is < 0. + + +Dotan Barak 2007-01-17 + + Check asprintf() return in pingpong examples + + Handle asprintf() allocation failures in pingpong examples. + + (cherry picked from commit db39573253488a800ad3a57bf8846902c090450f) + +Dotan Barak 2007-01-17 + + Check asprintf() return in pingpong examples + + Handle asprintf() allocation failures in pingpong examples. + + +Roland Dreier 2007-01-17 + + Check return of calloc() in ibv_get_device_list() + + Don't blindly copy device pointers if calloc() returns NULL. + + (cherry picked from commit 789728f7be1e6c7f22380ae739d7b692d2c08d0f) + +Roland Dreier 2007-01-17 + + Check return of calloc() in ibv_get_device_list() + + Don't blindly copy device pointers if calloc() returns NULL. 
+ + +Dotan Barak 2007-01-16 + + Fix some memory leaks in read_config() error path + + Don't leak path if stat() fails or file isn't a regular file. + + +Dotan Barak 2007-01-16 + + Handle asprintf memory allocation failures + + Cherry-picked from dd3d43d89e398f23c4824d26f2698446ff2d120f. + + +Dotan Barak 2007-01-16 + + Handle asprintf memory allocation failures + + +Steve Wise 2007-01-11 + + Don't lose devices when multiple RDMA devices are present + + When scanning through /sys/class/infiniband_verbs, link structs into + the list of found devices properly so that older devices aren't lost. + + +Roland Dreier 2007-01-11 + + Don't use d_type member of struct dirent + + On some filesystems (notably reiserfs), dent->d_type is always + DT_UNKNOWN. Therefore libibverbs should use stat() to check file + types when scanning through directories rather than relying on the + d_type returned from readdir(). + + +Roland Dreier 2007-01-11 -2006-01-25 Roland Dreier + Revert "Pass driver data through ibv_cmd_req_notify_cq()" + + This reverts commit d5b9ab3d7009b77ee45e98827e803205d322ce7d, since + the Chelsio cxgb3 driver no longer needs it and the kernel side of + req_notify_cq() doesn't handle user data anyway. + + +Roland Dreier 2006-12-10 + + Implement new method for finding and loading device-specific drivers + + Export an ibv_register_driver() entry point, and expect plugins to + call it from __attribute__((constructor)) code. This will allow + multiple drivers to be statically linked in. + + Also read config files and then use dlopen() with a relative path to + find drivers (rather than searching a specific lib/infiniband/ + directory for .so files). This allows multiple versions of a driver + to be installed in parallel, and also allows for optimized drivers in + places like /lib/i686. + + Drivers should no longer export an ibv_driver_init() function any + more. 
Instead, they should add a function (which can be static) with + __attribute__((constructor)) that calls ibv_register_driver() to + register the driver's probe function. Also, drivers should install a + file with a line "driver " under ${sysconfdir}/libibverbs.d. + - * examples/pingpong.c, examples/pingpong.h, - examples/rc_pingpong.c, examples/uc_pingpong.c, - examples/srq_pingpong.c: Move pp_get_local_lid() to pingpong.c to - reduce code duplication. +Steve Wise 2006-10-06 -2006-01-22 Roland Dreier + Pass driver data through ibv_cmd_req_notify_cq() + + The Chelsio iWARP driver library needs to pass information to the + kernel device-specific driver for re-arming the CQ. + - * Release version 1.0-rc5. +Roland Dreier 2006-11-16 -2006-01-22 Dotan Barak + Fix rewritten test for linker script support + + Remove extra parameter to AC_SUBST left by mistake. + - * examples/devinfo.c (main): Make ibv_devinfo list all IB devices - by default, rather than the first device only. +Steve Wise 2006-10-05 -2006-01-20 Roland Dreier + Support provider response data in reg_mr command + + Allow kernel register MR command to pass back driver-specific data to + the userspace driver. The code is there to do this in the kernel, but + not in the libibverbs cmd interface. + - * examples/rc_pingpong.c, examples/uc_pingpong.c, - examples/srq_pingpong.c: Add "-m/--mtu=" option to set path MTU. - (Based on a patch from Ralph Campbell ) +Steve Wise 2006-11-16 - * examples/pingpong.c, examples/pingpong.h: Create generic - pingpong files so that we can start factoring out common code from - the pingpong examples. Start with functions to convert MTU to an - IBV enum value. + Add async_event callback function to struct ibv_context_ops + + Add a callback function so low-level driver libraries can get a + callback with each async event retrieved by the user. It allows the + bypass library to do WQ or CQ processing that needs to happen when a + fatal async event happens. 
This async callback is similar to the + cq_event callback that already exists in libibverbs. + -2006-01-17 Ralph Campbell +Roland Dreier 2006-11-13 - * examples/rc_pingpong.c (main), examples/srq_pingpong.c (main), - examples/uc_pingpong.c (main), examples/ud_pingpong.c (main): Fix - race when using CQ events by arming CQ before allowing remote side - to start sending. + Remove svn keywords + + Now that we're in git, there's no reason to have $Id in files. + + (cherry picked from d7f11c8805800d3644aa83d1d5f9f708bf9bf585 commit) -2006-01-06 Roland Dreier +Roland Dreier 2006-11-13 - * examples/srq_pingpong.c (main): Fix SRQ example to avoid - problems with many QPs and events. Based on a patch from Dotan - Barak (who also found the problem). + Remove svn keywords + + Now that we're in git, there's no reason to have $Id in files. + -2006-01-06 Ralph Campbell +Roland Dreier 2006-11-02 - * examples/rc_pingpong.c (main), examples/srq_pingpong.c (main), - examples/uc_pingpong.c (main), examples/ud_pingpong.c (main): Fix - test of return value of ibv_poll_cq(). + Bump version number + -2006-01-04 Dotan Barak +Roland Dreier 2006-10-31 - * include/infiniband/verbs.h: Fix mask names in description of - ibv_modify_srq. + Add final Debian changelog for libibverbs 1.0.4 + -2006-01-04 Michael S. Tsirkin +Roland Dreier 2006-10-31 - * src/init.c (ibverbs_init): Fix ibverbs_init for multiple adapters. - Noted by Christoph Raisch. + Roll libibverbs 1.0.4 release + -2005-12-15 Roland Dreier +Roland Dreier 2006-10-31 - * include/infiniband/verbs.h: Document that devices must be opened - before calling ibv_free_device_list(). - - * src/verbs.c (ibv_create_srq): Not all provider libraries will - support SRQs, so check if the create_srq method is defined before - calling it. 
(Based on a patch from Shirley Ma ) + Fix previous sq_draining change so it actually builds + -2005-11-11 Roland Dreier +Jack Morgenstein 2006-10-31 - * examples/asyncwatch.c, examples/rc_pingpong.c, - examples/srq_pingpong.c, examples/uc_pingpong.c, - examples/ud_pingpong.c, examples/device_list.c, - examples/devinfo.c: Update examples to match new API. - - * include/infiniband/verbs.h, src/device.c, src/init.c, - src/ibverbs.h: Change from dlist-based ibv_get_devices() API to - simpler ibv_get_device_list() and ibv_free_device_list() API. + Return sq_draining properly from query_qp + + Return the sq_draining value back to user space for query_qp instead + of the en_sqd_async notify value. This last is valid only for + modify_qp. For query_qp (according to the IB Spec V1.2), the draining + status should returned. + -2005-11-10 Sean Hefty +Roland Dreier 2006-10-30 - * include/infiniband/sa-kern-abi.h: New include file to contain - definitions of SA structures passed between userspace and kernel. + Make device finding work again + + Fix operator precedence issues in how libibverbs-1.0 adds devices to + its list. Also change so that devices are reported in the same order + as they were in older versions. + - * include/infiniband/sa.h: New include file for definitions of - SA structures used by multiple libraries. +Roland Dreier 2006-10-27 - * include/infiniband/marshall.h src/marshall.c: New files to define - routines used to exchange data with kernel modules. + Fix static linking of libibverbs-1.0 + - * include/infiniband/kern-abi.h: Added data structures used to exchange - QP attribute with kernel modules. +Roland Dreier 2006-10-17 -2005-11-09 Michael S. Tsirkin + Add rmb() and wmb() to + + Update i386/x86_64 versions to use "lock; addl $0"/"lfence" instead of + just a compiler barrier, to guard against out-of-order speculative + reads. 
+ - * src/device.c (ibv_get_devices): Make function reentrant by using - a mutex to make sure we initialize the device list at most once. +Jeff Squyres 2006-10-17 -2005-11-08 Roland Dreier + Add README notes about Valgrind memcheck support + - * src/cmd.c (ibv_cmd_create_qp): Add handling for new create QP - interface, which has the kernel return QP capabilities. +Roland Dreier 2006-10-17 - * src/cmd.c (ibv_cmd_modify_srq): Split off handling of modify SRQ - for ABI versions 3 and older, which passed max_sge as part of command. + Add handling of --with-valgrind= + + Change libibverbs/libmthca Valgrind support so that --with-valgrind + can take an installation directory to look in for Valgrind headers. + -2005-10-30 Roland Dreier +Roland Dreier 2006-10-08 - * examples/srq_pingpong.c (pp_init_ctx): Create CQ with rx_depth + - num_qp entries, instead of just rx_depth + 1 entries, because - there can be one send completion pending for each QP. + Update ChangeLogs to give credit for Valgrind annotations + -2005-10-25 Roland Dreier +Roland Dreier 2006-10-05 - * Release version 1.0-rc4. + Fix up configure test for + + Print a warning if Valgrind annotations are requested but + is not found. + - * examples/uc_pingpong.c (pp_connect_ctx): Fix QP attribute masks - used to modify QP to RTR and RTS -- we should not be setting - RDMA/atomic attributes for UC QPs. Now that the mthca kernel - driver bug is fixed, the error is exposed here. +Roland Dreier 2006-10-04 - * examples/rc_pingpong.c, examples/srq_pingpong.c, - examples/uc_pingpong.c, examples/ud_pingpong.c: Keep track of - whether send and/or receive is pending. This avoids failures when - the remote side receives data and posts a send very quickly, and - the local side completes the receive before the previous send. - With the old code, this could result in posting a send before the - previous send completed, and therefore overrun the send queue. 
+ Add Valgrind annotations + + Add basic Valgrind annotations to libibverbs and libmthca (disabled by + default, can be enabled by configuring with --with-valgrind). These + reduce false positive warnings from the Valgrind memcheck module. + + Based on work and suggestions from Rainer Keller and + Jeff Squyres . + -2005-10-23 Roland Dreier +Roland Dreier 2006-09-22 - * src/cmd.c (ibv_cmd_get_context_v2): Correct silly mistake in - computation of size of buffer for old ABI command: we need to use - sizeof *cmd instead of sizeof cmd, since cmd is a pointer. + Update libibverbs man pages so they don't refer to "OpenIB" + -2005-10-21 Roland Dreier +Roland Dreier 2006-09-06 - * src/cmd.c (ibv_cmd_post_send, ibv_cmd_post_recv, - ibv_cmd_post_srq_recv): Correct value that we check write() return - value against so that we check against the size we actually try to - write, instead of just sizeof cmd. + Debian packaging improvements + -2005-10-19 Roland Dreier +Roland Dreier 2006-07-04 - * src/cmd.c (ibv_cmd_req_notify_cq): Correct how we pass - solicited_only flag into the kernel. + Fix libibverbs definition of mb() for sparc + -2005-10-13 Roland Dreier +Roland Dreier 2006-06-01 - * include/infiniband/driver.h, src/cmd.c, src/libibverbs.map: Add - command functions for calling new kernel commands. + Fix ibv_get_device_list() to really NULL-terminate the array + - * include/infiniband/verbs.h: Add qp_type to struct ibv_qp so that - we know when we're posting a send on a UD QP, and add kernel - handle member to struct ibv_ah so we can handle drivers that do - create AH and destroy AH operations in the kernel. - - * include/infiniband/kern-abi.h: Add new command structures for - poll CQ, request notification for CQ, post send, post receive, - post SRQ receive, create AH and destroy AH commands. These will - be used by the PathScale userspace driver. 
+Roland Dreier 2006-06-01 -2005-10-12 Roland Dreier + Fix minor memory leaks + + The result of asprintf needs to be free when no longer needed. + - * examples/srq_pingpong.c (main): Zero out unused entries in - my_dest array to avoid string overflows when we send to the other - side. +Leonid Arsh 2006-05-31 -2005-10-09 Roland Dreier + Add IBV_EVENT_CLIENT_REREGISTER to libibverbs + - * examples/devinfo.c (print_hca_cap): Only print max_mr_size and - page_size_cap if verbose is set. +Roland Dreier 2006-05-31 -2005-10-05 Roland Dreier + Fix update to Debian policy 3.7.2 + - * src/cmd.c (ibv_cmd_modify_srq): Add function for marshalling - modify SRQ command. +Roland Dreier 2006-11-02 -2005-09-29 Roland Dreier + Rewrite test for linker script to get rid of Makefile conditionals + - * examples/devinfo.c (print_hca_cap): Get rid of formatting of - firmware version in what should be device-independent code. +Roland Dreier 2006-10-31 - * include/infiniband/driver.h, include/infiniband/verbs.h, - src/cmd.c (ibv_cmd_query_device): Change firmware version in - struct ibv_device_attr to be a string formatted by device-specific - library. + Fix previous sq_draining change so it actually builds + -2005-09-25 Roland Dreier +Jack Morgenstein 2006-10-31 - * examples/rc_pingpong.c, examples/srq_pingpong.c, - examples/uc_pingpong.c, examples/ud_pingpong.c: Update to match - new completion channel and CQ creation API. + Return sq_draining properly from query_qp + + Return the sq_draining value back to user space for query_qp instead + of the en_sqd_async notify value. This last is valid only for + modify_qp. For query_qp (according to the IB Spec V1.2), the draining + status should returned. + - * include/infiniband/driver.h, include/infiniband/verbs.h, - src/device.c, src/ibverbs.h, src/verbs.c, src/cmd.c: Add notion of - "completion channel" that allows consumers to dynamically create - and destroy file descriptors for retrieving completion events. 
- Completion channels are handled natively with kernel ABI version 3 - and simulated with backwards compatibility implementations for ABI - versions 1 and 2. +Roland Dreier 2006-10-31 - * include/infiniband/kern-abi.h: Update to match kernel ABI - version 3. + Minor cleanups + + Remove unused driver structure member, constify a few things, etc. + -2005-09-07 Roland Dreier +Roland Dreier 2006-10-17 - * src/device.c (ibv_get_device_guid): Use htonll() instead of - relying on pointer aliasing (which seems to break for some gcc - versions). + Add rmb() and wmb() to + + Update i386/x86_64 versions to use "lock; addl $0"/"lfence" instead of + just a compiler barrier, to guard against out-of-order speculative + reads. + - * include/infiniband/arch.h: Add htonll() and ntohll() functions. +Jeff Squyres 2006-10-17 -2005-09-06 Roland Dreier + Add README notes about Valgrind memcheck support + - * include/infiniband/kern-abi.h, include/infiniband/verbs.h, - src/cmd.c, src/device.c, src/verbs.c, examples/asyncwatch.c: - Update to handle new kernel ABI for avoiding stale completion - events. This is completely analogous to the previous asynchronous - event change. +Roland Dreier 2006-10-17 -2005-08-31 Roland Dreier + Add handling of --with-valgrind= + + Change libibverbs Valgrind support so that --with-valgrind can take an + installation directory to look in for Valgrind headers. + - * include/infiniband/kern-abi.h, include/infiniband/verbs.h, - src/cmd.c, src/device.c, src/ibverbs.h, src/init.c, src/verbs.c, - examples/asyncwatch.c: Update to handle new kernel ABI for - avoiding stale asynchronous events. When a CQ, QP or SRQ is - destroyed, the kernel reports the number of events it has given to - userspace, and we wait until we've handled the same number of - events. +Roland Dreier 2006-10-08 - This does introduce a library API change: consumers are now - required to call ibv_put_async_event() to release every - asynchronous event that they retrieve via ibv_get_async_event(). 
+ Update ChangeLogs to give credit for Valgrind annotations + -2005-08-30 Roland Dreier +Roland Dreier 2006-10-05 - * man/ibv_asyncwatch.1, man/ibv_devices.1, man/ibv_devinfo.1, - man/ibv_rc_pingpong.1, man/ibv_srq_pingpong.1, - man/ibv_uc_pingpong.1, man/ibv_ud_pingpong.1: Add man pages for - example programs. + Fix up configure test for + + Print a warning if Valgrind annotations are requested but + is not found. + - * examples/devinfo.c: Merge with Dotan Barak's vstat tool. +Roland Dreier 2006-10-04 + + Add Valgrind annotations + + Add basic Valgrind annotations to libibverbs (disabled by default, can + be enabled by configuring with --with-valgrind). These reduce false + positive warnings from the Valgrind memcheck module. + + Based on work and suggestions from Rainer Keller and + Jeff Squyres . + + +Roland Dreier 2006-10-03 + + Add node_type and transport_type members to struct ibv_device + + This helps apps work with both iWARP and IB devices. + + +Roland Dreier 2006-09-22 + + Update libibverbs man pages so they don't refer to "OpenIB" + + +Roland Dreier 2006-09-12 + + Fix alignment of work request structures + + Swap next and wr_id members of struct ibv_send_wr and ibv_recv_wr to + allow wr_id to be naturally aligned on 32-bit architectures without + padding. + + +Roland Dreier 2006-09-06 + + Debian packaging improvements + + +Ralph Campbell 2006-08-23 + + Add response handling to ibv_cmd_resize_cq() + + Add resp and resp_size parameters to ibv_cmd_resize_cq() so that the + low-level driver in the kernel can return device-specific information + from the resize CQ operation. + + +Roland Dreier 2006-08-23 + + Fix formatting of pingpong man pages slightly + + +Roland Dreier 2006-08-09 + + Simplify Debian package version + + Use ~ in Debian package version to get sort order of -pre packages + correct in a better way. 
+ + +Roland Dreier 2006-08-03 + + Make fork() work for verbs consumers + + Add code to libibverbs that uses madvise(..., MADV_DONTFORK) to make + fork() work for verbs consumers. + + +Roland Dreier 2006-07-04 + + Fix libibverbs definition of mb() for sparc + + +Sean Hefty 2006-06-16 + + Add some helper functions to simplify using UD QPs + + Add new routines: ibv_init_ah_from_wc() and ibv_create_ah_from_wc() to + simplify UD QP communication. + + Expose ibv_copy_ah_attr_from_kern to retrieve ibv_ah_attr from kernel + for a UD QP. + + +Roland Dreier 2006-06-01 + + Fix ibv_get_device_list() to really NULL-terminate the array + + +Roland Dreier 2006-06-01 + + Fix minor memory leaks + + The result of asprintf needs to be freed when no longer needed. + + +Leonid Arsh 2006-05-31 + + Add IBV_EVENT_CLIENT_REREGISTER to libibverbs + + +Roland Dreier 2006-05-31 + + Fix update to Debian policy 3.7.2 + + +Roland Dreier 2006-05-24 + + Branch a libibverbs-1.0 tree for maintenance + + Start 1.1 development in main libibverbs tree: + - Remove libsysfs use + - Remove deprecated symbols + + +Roland Dreier 2006-05-22 + + Add a request for ChangeLog entries to the README + + +Michael S. Tsirkin 2006-05-22 + + Get rid of commas at end of enum lists + + While comma at end of enumerator list is legal since 1999, some tools + (notably gcc versions pre-4.0) seem to default to 1989 mode when + running with -pedantic flag, and warn about this usage. + + Since most of our enums in header files do not have comma at end, it's + probably easier to fix the remaining two cases than educate all users + of libibverbs on virtues of C99.
+ + +Roland Dreier 2006-05-04 + + Bump version number + + +Roland Dreier 2006-05-02 + + Update to debian policy 3.7.0 + + +Roland Dreier 2006-05-02 + + Use correct email address in Debian changelog + + +Roland Dreier 2006-05-02 + + Roll libibverbs 1.0.3 release + + +Roland Dreier 2006-05-01 + + Fix Debian sparc build failure + + Fix build failure with Debian sparc compiler: membar is only supported + under V9 ISA. + + +Roland Dreier 2006-04-12 + + Fix memory leak in ibv_read_sysfs_file() + + +Roland Dreier 2006-04-11 + + Reduce dependency on libsysfs + + Reduce libibverbs dependency on libsysfs by using local functions for + internal sysfs access. libsysfs is still required because of the ABI, + which passes a struct sysfs_class_device * to low-level driver init + functions. + + +Roland Dreier 2006-04-11 + + Deprecate "ib_XXX" names + + Deprecate various "ib_XXX" names (introduced as part of SA and + marshalling code). Preferred versions like "ibv_XXX" are now + available. + + +Hoang-Nam Nguyen 2006-04-11 + + Add ibv_rate_to_mult() and mult_to_ibv_rate() functions + + +Roland Dreier 2006-04-11 + + Add __attribute_const macro for gcc >= 3 + + +Roland Dreier 2006-04-10 + + Minor Debian packaging tweaks + + +Roland Dreier 2006-04-10 + + Make libsysfs dependency more robust in spec file + + Depend on %{_includedir}/sysfs/libsysfs.h instead of sysfs-devel to be + friendlier to building on SuSE distros. 
+ + +Roland Dreier 2006-04-08 + + Revert unreviewed changes made without maintainer approval + + +Bryan O'Sullivan 2006-04-06 + + Update spec file from 1.0 branch + +Roland Dreier 2006-03-28 + + Print a warning if dlopen() of a driver fails + + +Roland Dreier 2006-03-27 + + Change Priority field of Debian packages to "extra" + + +Dotan Barak 2006-03-23 + + Added a print of the event name in string format + + +Roland Dreier 2006-03-22 + + Improve documentation of ibv_req_notify_cq() + + +Roland Dreier 2006-03-17 + + Add include of + + This fixes warnings about implicit declaration of free(). + + +Roland Dreier 2006-03-17 + + Add some missing dependencies in Debian control files + + +Roland Dreier 2006-03-16 + + Expand debian/copyright + + Put more detail about copyright holders and licenses in debian/copyright. + + +Roland Dreier 2006-03-15 + + Build-Depend on debhelper (>= 5) + + Debian packaging sets compat to 5, so packages also need to + Build-Depend on debhelper (>= 5). + + +Roland Dreier 2006-03-15 + + Bump libibverbs version + + +Roland Dreier 2006-03-15 + + Roll libibverbs 1.0.2 release + + Debian packaging fixes only. + + +Roland Dreier 2006-03-15 + + Debian packaging fixes based on feedback from debian-mentors mailing list + + +Roland Dreier 2006-03-14 + + Bump version in svn, and switch to non-native Debian packaging + + +Roland Dreier 2006-03-14 + + Roll libibverbs 1.0.1 release + + +Roland Dreier 2006-03-14 + + Use sysfs_open_attribute() and sysfs_read_attribute() + + Use sysfs_open_attribute() and sysfs_read_attribute() instead of the + deprecated function sysfs_read_attribute_value(), which is no longer + present in libsysfs2 (which is already in Debian and Ubuntu). + + +Roland Dreier 2006-03-14 + + Bump libibverbs version to 1.0.1 + + +Roland Dreier 2006-03-13 + + Roll libibverbs 1.0 release + + +Jack Morgenstein 2006-03-06 + + Add enum ibv_rate + + Add enum ibv_rate to define encoding of static_rate field (based on a + patch from Jack Morgenstein ). 
+ + +Ralph Campbell 2006-03-06 + + Fix memory leak in find_drivers() + + I was browsing through the libibverbs code and found a minor memory + leak. Here is the fix. + + +Roland Dreier 2006-02-24 + + Remove copyright notices without license text + + They bugged some silly people. + + +Dotan Barak 2006-02-23 + + Add support for kernel ABI version 6 + + Add support for kernel ABI 6: take SRQ capacity from kernel response + to create SRQ. + + +Roland Dreier 2006-02-16 + + Bump libibverbs version + + Be optimistic and assume that the next release will be 1.0. + + +Roland Dreier 2006-02-16 + + Roll libibverbs 1.0-rc7 release + + +Roland Dreier 2006-02-16 + + Add support for kernel ABI version 5 + + Add support for kernel ABI 5, which properly aligns struct ibv_create_qp_resp. + + +Roland Dreier 2006-02-15 + + Add response handling to ibv_cmd_create_qp() + + Add resp and resp_size parameters to libibverbs's ibv_cmd_create_qp() + function so that kernel low-level drivers can return private data to + userspace device drivers. + + +Roland Dreier 2006-02-15 + + Bump libibverbs version number to 1.0-rc7 + + +Roland Dreier 2006-02-15 + + Roll libibverbs 1.0-rc6 release + + +Roland Dreier 2006-02-15 + + Add lots more TODO information to the libibverbs README + + +Dotan Barak 2006-02-14 + + Report board_id from ibv_devinfo, if present + + +Roland Dreier 2006-02-14 + + Remove cpu_to_be64/be64_to_cpu + + Remove duplicated cpu_to_be64/be64_to_cpu functions in favor of + htonll/ntohll from . + + +Dotan Barak 2006-02-14 + + libibverbs changes for query QP and query SRQ verbs + + +Roland Dreier 2006-02-01 + + Fix pingpong examples' handling of specifying device by name + + +Roland Dreier 2006-02-01 + + Remove useless "extern" from function declarations + + +Roland Dreier 2006-01-30 + + libibverbs changes to handle resizing CQs + + Essentially just adding API and support for passing the call through + to provider plug-ins. 
+ + +Roland Dreier 2006-01-26 + + Add "extern" to declarations for consistency + + +Roland Dreier 2006-01-26 + + Fix Source: line in spec files to point to new tarball name + + +Roland Dreier 2006-01-25 + + Move pp_get_local_lid() to common file + + Factor out more common pingpong code. + + +Roland Dreier 2006-01-23 + + Bump libibverbs version number to 1.0-rc6 + + +Roland Dreier 2006-01-23 + + Roll libibverbs 1.0-rc5 release + + +Roland Dreier 2006-01-23 + + Update TODO section in README + + +Dotan Barak 2006-01-23 + + List all devices in ibv_devinfo + + Make ibv_devinfo list all IB devices by default, rather than the first + device only. + + +Roland Dreier 2006-01-21 + + Factor out some common code in pingpong examples + + - Create pingpong.c/pingpong.h to hold common code for pingpong examples. + - Add option to set path MTU for connected transport pingpong examples. + + +Ralph Campbell 2006-01-17 + + Arm CQ early enough in pingpong examples + + The example pingpong programs have a race when using events where the + client sends the first packet but the server hasn't yet armed the CQ + by calling ibv_req_notify_cq() thus waiting forever in + ibv_get_cq_event(). The fix is to move the call to + ibv_req_notify_cq() before signaling the client to "start". + + +Roland Dreier 2006-01-07 + + Fix ibv_srq_pingpong bug with many QPs when using CQ events + + Fix SRQ example to avoid problems with many QPs and events. Based on + a patch from Dotan Barak (who also found the problem). + + +Ralph Campbell 2006-01-07 + + Fix test of return value of ibv_poll_cq() in pingpong examples + + +Dotan Barak 2006-01-04 + + Typo fix in the description of ibv_modify_srq() + + +Michael S. Tsirkin 2006-01-04 + + Fix ibverbs_init for multiple adapters + + Noted by Christoph Raisch. + + +Michael S. Tsirkin 2005-12-16 + + struct ibv_send_wr.imm_data is in network byte order + + verbs.h documents ordering for immediate data in completion, but not + in send work request. 
+ + +Roland Dreier 2005-12-15 + + Improve ibv_free_device_list() documentation + + +Shirley Ma 2005-12-15 + + Handle devices that don't support SRQ + + create_srq is not a mandatory device function, therefore in + userspace/libibverbs/src/verbs.c ibv_create_srq should check + create_srq() first before calling it, otherwise the caller will cause + the segmentation fault on device which doesn't support SRQs. + + +Roland Dreier 2005-12-14 + + Change from ibv_get_devices() to ibv_get_device_list() + + Change libibverbs API for listing all known devices from + ibv_get_devices() to ibv_get_device_list(), and update all in-tree + uses of this API. + + +Michael S. Tsirkin 2005-11-29 + + Fix EXTRA_DIST: sa-kern-abi.h path is wrong + + +Roland Dreier 2005-11-13 + + Various trivial picayune libibverbs changes + + +Sean Hefty 2005-11-11 + + Add support for userspace RDMA connection management abstraction (CMA) + + Add common user/kernel data structures and copy routines in libibverbs. + + +Roland Dreier 2005-11-09 + + Add changelog entry for previous checkin + +Michael S. Tsirkin 2005-11-09 + + Make ibv_get_devices reentrant + + +Jack Morgenstein 2005-11-09 + + Handle kernel uverbs ABI version 4 + + Update libibverbs and libmthca to handle new kernel ABI 4, which has + the kernel compute exact capabilities for QPs. + + +Roland Dreier 2005-10-30 + + Fix CQ overrun in SRQ pingpong example + + +Roland Dreier 2005-10-30 + + Bump debian package version to -rc5 as well + +Roland Dreier 2005-10-26 + + Bump libibverbs version number to 1.0-rc5 + + +Roland Dreier 2005-10-26 + + Really roll releases + + Fix version in spec file changelog. + + +Roland Dreier 2005-10-26 + + Roll libibverbs 1.0-rc4 release + + +Roland Dreier 2005-10-25 + + Fix QP attr masks in ibv_uc_pingpong + + Fix QP attribute masks used in ibv_uc_pingpong -- now that mthca + correctly fails if we try to set RDMA/atomic capabilities for UC QPs, + the examples need to be fixed as well. 
+ + +Roland Dreier 2005-10-25 + + Handle out-of-order completions in pingpong examples + + Keep track of whether send and/or receive is pending in libibverbs + pingpong examples. This avoids failures when the remote side receives + data and posts a send very quickly, and the local side completes the + receive before the previous send. With the old code, this could + result in posting a send before the previous send completed, and + therefore overrun the send queue. + + +Roland Dreier 2005-10-25 + + Fix "Source:" line in RPM spec files to be a valid URL + + +Roland Dreier 2005-10-23 + + Fix buffer size computation in ibv_cmd_get_context_v2() + + Correct silly mistake in ibv_cmd_get_context_v2() computation of size + of buffer for old ABI command: we need to use sizeof *cmd instead of + sizeof cmd, since cmd is a pointer. + + +Roland Dreier 2005-10-23 + + Move where we set qp->qp_type from cmd.c to verbs.c + + Just to be really anal. + + +Roland Dreier 2005-10-21 + + Check write() return value against size we tried to write + + In libibverbs post send, post recv and post SRQ recv marshalling code, + correct value that we check write() return value against so that we + check against the size we actually try to write, instead of just + sizeof cmd. + + +Roland Dreier 2005-10-19 + + Correct sense of solicited_only parameter to ibv_cmd_req_notify_cq() + + +Roland Dreier 2005-10-17 + + Improve comments for ibv_ack_async_event and ibv_ack_async_events + + Pointed out by Michael S. Tsirkin. + + +Roland Dreier 2005-10-14 + + Add support for new datapath kernel commands + + Add handling for calling into kernel for datapath operations, so that + we can handle the PathScale userspace driver. + + +Roland Dreier 2005-10-12 + + Avoid potential buffer overrun in ibv_srq_pingpong + + Fix overrun in ibv_srq_pingpong (detected by Fedora Core 4 + "FORTIFY_SOURCE").
+ + +Roland Dreier 2005-10-11 + + Remove unnecessary Debian postrm script, and simplify postinst script + + +Roland Dreier 2005-10-11 + + Add Debian postinst/postrm scripts to add/remove "rdma" group + + +Roland Dreier 2005-10-11 + + Be a little less verbose if verbose flag is not set + + +Roland Dreier 2005-10-05 + + Bump libibverbs version + + Also finish support for modify SRQ verb. + + +Roland Dreier 2005-09-29 + + Move formatting of device's firmware version into device-specific code + + +Roland Dreier 2005-09-29 + + Minor libibverbs packaging cleanups + + Add dist to release and require sysfsutils-devel for the -devel package. + + +Roland Dreier 2005-09-29 + + Initial attempt at Fedora Extras spec files for libibverbs and libmthca + + +Roland Dreier 2005-09-26 + + Update libibverbs and libmthca to handle uverbs ABI version 3 + + +Roland Dreier 2005-09-16 + + Debian packaging changes suggested by debian-mentors review + + +Roland Dreier 2005-09-13 + + Print components of firmware version in hex + + +Roland Dreier 2005-09-09 + + Make command structure sizes the same on 32-bit and 64-bit + + Add 4-byte reserved members to the new destroy CQ, destroy QP and + destroy SRQ command structures so that they become a multiple of 8 + bytes in size. This fixes the structures so they have the same size + on both 32-bit and 64-bit architectures (which is required so that + 32-bit userspace on a 64-bit kernel works correctly). + + +Roland Dreier 2005-09-07 + + Make sure __BYTE_ORDER is defined + + Fail compilation if __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN. + + +Roland Dreier 2005-09-07 + + Include htonll() and ntohll() in infiniband/arch.h + + Move htonll() and ntohll() from libmthca into libibverbs's + infiniband/arch.h, and use them in ibv_get_device_guid() to + avoid pointer aliasing (which some versions of gcc miscompile).
+ + +Roland Dreier 2005-09-07 + + Update libibverbs for stale completion event handling + + +Roland Dreier 2005-08-31 + + Add missing half of change log entry + + +Roland Dreier 2005-08-31 + + Add -dbg packages with debugging symbols to Debian packages + + +Roland Dreier 2005-08-31 + + Update for new kernel ABI (stale event handling) + + Update to handle new kernel ABI for avoiding stale asynchronous + events. When a CQ, QP or SRQ is destroyed, the kernel reports the + number of events it has given to userspace, and we wait until we've + handled the same number of events. + + This does introduce a library API change: consumers are now required + to call ibv_put_async_event() to release every asynchronous event that + they retrieve via ibv_get_async_event(). + + +Roland Dreier 2005-08-31 + + Add man pages for libibverbs example programs + + +Roland Dreier 2005-08-30 + + Fix warnings on platforms where uint64_t != unsigned long long + + +Roland Dreier 2005-08-30 + + Fix up Debian packaging + + - Add bug #s for ITP bugs + - Change @topspin.com addresses to @cisco.com + - Add Debian-specific package minor version + + +Roland Dreier 2005-08-30 + + Merge Dotan Barak's vstat tool into ibv_devinfo + + +Pete Wyckoff 2005-08-26 + + Avoid segv when no IB devices are found + + +Roland Dreier 2005-08-10 + + Add support for SRQs + + Add libibverbs support for SRQs, including ibv_srq_pingpong example. + + +Roland Dreier 2005-08-09 + + Make --rx-depth work in pingpong examples + + Fix option handling in pingpong examples so that --rx-depth long + option actually works. + + +Michael S. Tsirkin 2005-07-25 + + Lazy initialization of libibverbs on ibv_get_devices + + +Roland Dreier 2005-07-01 + + Add port info dump to devinfo program + + +Roland Dreier 2005-07-01 + + Move ud-pingpong.c to ud_pingpong.c + + Be consistent with uc_pingpong.c and rc_pingpong.c.
+ + +Roland Dreier 2005-07-01 + + Get ready to receive before other side starts to send + + Have server side of pingpong get ready to receive before client side + starts sending, to close a race condition. + + +Roland Dreier 2005-06-28 + + Free context before closing associated file descriptors + + Free context before closing associated file descriptors. This can + avoid some problems acquiring mmap_sem when there is still memory to + be released by the kernel. + + +Roland Dreier 2005-06-24 + + Always pass 0 for CQ event handler + + Always pass 0 to the kernel for CQ event handler until multiple + handlers are implemented and added to the API. + + +Roland Dreier 2005-06-24 + + Don't follow driver path in user's environment if we're running SUID + + +Roland Dreier 2005-06-23 + + Expand libibverbs README + + +Roland Dreier 2005-06-21 + + Update for kernel ABI changes + + Update userspace verbs libraries to match kernel ABI changes (required + for using "struct ib_udata *" interface). + + +Roland Dreier 2005-06-15 + + Include debian/ directory in distribution tarballs + + +Roland Dreier 2005-06-15 + + Create new ibv_uc_pingpong example + + Create new ibv_uc_pingpong example for using UC transport. Move + original RC pingpong to ibv_rc_pingpong for consistency. + + +Roland Dreier 2005-06-06 + + Write up some basics in libibverbs/README + + +Roland Dreier 2005-06-06 + + Implement userspace side of query_device verb + + Implement query_device verb and add a "devinfo" example. + + +Roland Dreier 2005-05-31 + + Add generic userspace part of query GID and query P_Key verbs + + +Michael S. Tsirkin 2005-05-25 + + Fix uninitialized AH attributes in pingpong examples + + Some address handle attributes (notably static rate flow control) were + uninitialized. Fix this by initializing all fields to 0 using + designated initializers. + + +Roland Dreier 2005-05-14 + + Add userspace side of {attach,detach}_mcast verbs + + +Michael S.
Tsirkin 2005-05-09 + + Fix pingpong parameter parsing typos + + Fix for two obvious typos in pingpong and ud-pingpong parameter parsing + (found by Grant in rdma_lat code). + + +Roland Dreier 2005-04-29 + + Update error messages + + Update error messages in libibverbs init so they're easier to + understand, as suggested by Grant Grundler. + +Roland Dreier 2005-04-28 + + Set version number to 0.1.0 + + Also add debian/ directories to build libibverbs. + +Roland Dreier 2005-04-28 + + Add documentation for ibv_poll_cq function + +Roland Dreier 2005-04-20 + + Add command codes for all verbs + + Add command codes for all verbs to userspace include files. Define + parameter structures for query GID, query P_Key, attach multicast and + detach multicast commands. + + +Roland Dreier 2005-04-18 + + Load driver statically linked into executable + + +Roland Dreier 2005-04-07 + + Commit libibverbs code from roland-uverbs branch back onto trunk + +Roland Dreier 2005-01-19 + + Continue implementing verbs + + Add support for opening kernel uverbs file, getting context and event + FDs, and reading async events. + +Roland Dreier 2005-01-13 + + Fix library name passed to AC_INIT() + +Roland Dreier 2005-01-10 + + Fill in more verbs API + + Start filling in more of the verbs API. + + Implement tracking for possibly overlapping locked memory ranges. + +Roland Dreier 2004-12-29 + + Remove unused old Makefile.am files + +Roland Dreier 2004-12-28 + + Complete rename to ib_devices + + Fix Makefile.am to build the right executable. + + +Roland Dreier 2004-12-28 + + Change ib_drivers example to ib_devices + + Rename ib_drivers example program to ib_devices, which is a much more + sensible name. 
+ Index: contrib/ofed/libibverbs/Makefile.am =================================================================== --- contrib/ofed/libibverbs/Makefile.am +++ contrib/ofed/libibverbs/Makefile.am @@ -1,23 +1,33 @@ -INCLUDES = -I$(srcdir)/include +AM_CPPFLAGS = -I$(srcdir)/include lib_LTLIBRARIES = src/libibverbs.la -AM_CFLAGS = -g -Wall -D_GNU_SOURCE +ACLOCAL_AMFLAGS = -I config +AM_CFLAGS = -g -Wall -Werror -src_libibverbs_la_CFLAGS = $(AM_CFLAGS) -DIBV_CONFIG_DIR=\"$(sysconfdir)/libibverbs.d\" +src_libibverbs_la_CFLAGS = $(AM_CFLAGS) -DIBV_CONFIG_DIR=\"$(sysconfdir)/libibverbs.d\" \ + $(LIBNL_CFLAGS) libibverbs_version_script = @LIBIBVERBS_VERSION_SCRIPT@ src_libibverbs_la_SOURCES = src/cmd.c src/compat-1_0.c src/device.c src/init.c \ src/marshall.c src/memory.c src/sysfs.c src/verbs.c \ - src/enum_strs.c -src_libibverbs_la_LDFLAGS = -version-info 1 -export-dynamic \ + src/enum_strs.c src/cmd_exp.c +if ! NO_RESOLVE_NEIGH +src_libibverbs_la_SOURCES += src/neigh.c +noinst_HEADERS = src/neigh.h +endif +src_libibverbs_la_LDFLAGS = -version-info 1 -export-dynamic $(LIBNL_LIBS)\ $(libibverbs_version_script) src_libibverbs_la_DEPENDENCIES = $(srcdir)/src/libibverbs.map bin_PROGRAMS = examples/ibv_devices examples/ibv_devinfo \ examples/ibv_asyncwatch examples/ibv_rc_pingpong examples/ibv_uc_pingpong \ - examples/ibv_ud_pingpong examples/ibv_srq_pingpong + examples/ibv_ud_pingpong examples/ibv_srq_pingpong examples/ibv_shared_mr \ + examples/ibv_xsrq_pingpong examples/ibv_cc_pingpong examples/ibv_task_pingpong \ + examples/ibv_dcini examples/ibv_dctgt examples/ibv_umr examples/ibv_intf \ + examples/ibv_polldcinfo + examples_ibv_devices_SOURCES = examples/device_list.c examples_ibv_devices_LDADD = $(top_builddir)/src/libibverbs.la examples_ibv_devinfo_SOURCES = examples/devinfo.c @@ -30,44 +40,121 @@ examples_ibv_ud_pingpong_LDADD = $(top_builddir)/src/libibverbs.la examples_ibv_srq_pingpong_SOURCES = examples/srq_pingpong.c examples/pingpong.c 
examples_ibv_srq_pingpong_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_shared_mr_SOURCES = examples/shared_mr.c +examples_ibv_shared_mr_LDADD = $(top_builddir)/src/libibverbs.la examples_ibv_asyncwatch_SOURCES = examples/asyncwatch.c examples_ibv_asyncwatch_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_xsrq_pingpong_SOURCES = examples/xsrq_pingpong.c examples/pingpong.c +examples_ibv_xsrq_pingpong_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_cc_pingpong_SOURCES = examples/cc_pingpong.c +examples_ibv_cc_pingpong_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_task_pingpong_SOURCES = examples/task_pingpong.c +examples_ibv_task_pingpong_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_dcini_SOURCES = examples/dcini.c +examples_ibv_dcini_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_dctgt_SOURCES = examples/dctgt.c examples/pingpong.c examples/dc.h +examples_ibv_dctgt_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_umr_SOURCES = examples/umr_rc.c examples/pingpong.c +examples_ibv_umr_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_intf_SOURCES = examples/intf.c examples/get_clock.c examples/get_clock.h +examples_ibv_intf_LDADD = $(top_builddir)/src/libibverbs.la +examples_ibv_polldcinfo_SOURCES = examples/polldcinfo.c +examples_ibv_polldcinfo_LDADD = $(top_builddir)/src/libibverbs.la + + +# Enable ibverbs library test +# +if TEST_ENABLE + +# Primary ibverbs test +# +TESTS = ibv_test +noinst_PROGRAMS = ibv_test + +ibv_test_CXXFLAGS = -g -Wall -O3 -fno-strict-aliasing +ibv_test_CPPFLAGS = \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/tests \ + -I$(top_srcdir)/tests/cmn +ibv_test_SOURCES = \ + tests/cmn/gtest_cmn.h \ + tests/gtest/gtest.h \ + tests/api/gtest_cd.h \ + tests/gtest_main.cc \ + tests/cmn/gtest_cmn.cc \ + tests/api/gtest_cd_tc1.cc \ + tests/api/gtest_cd_tc2.cc \ + tests/api/gtest_cd_tc3.cc \ + tests/api/gtest_cd_tc4.cc \ + tests/api/gtest_cd_tc5.cc \ + tests/api/gtest_cd_tc6.cc \ + 
tests/api/gtest_cd_tc7.cc \ + tests/api/gtest_cd_tc8.cc \ + tests/api/gtest_init.cc +ibv_test_LDADD = \ + $(top_builddir)/src/libibverbs.la + +test: ibv_test + rm -f core.* + ./ibv_test + +valgrind: ibv_test + valgrind --tool=memcheck --leak-check=full --track-origins=yes ./ibv_test + +endif libibverbsincludedir = $(includedir)/infiniband libibverbsinclude_HEADERS = include/infiniband/arch.h include/infiniband/driver.h \ include/infiniband/kern-abi.h include/infiniband/opcode.h include/infiniband/verbs.h \ - include/infiniband/sa-kern-abi.h include/infiniband/sa.h include/infiniband/marshall.h + include/infiniband/sa-kern-abi.h include/infiniband/sa.h include/infiniband/marshall.h \ + include/infiniband/ofa_verbs.h include/infiniband/driver_exp.h include/infiniband/kern-abi_exp.h \ + include/infiniband/verbs_exp.h man_MANS = man/ibv_asyncwatch.1 man/ibv_devices.1 man/ibv_devinfo.1 \ man/ibv_rc_pingpong.1 man/ibv_uc_pingpong.1 man/ibv_ud_pingpong.1 \ man/ibv_srq_pingpong.1 man/ibv_alloc_pd.3 man/ibv_attach_mcast.3 \ man/ibv_create_ah.3 man/ibv_create_ah_from_wc.3 \ man/ibv_create_comp_channel.3 man/ibv_create_cq.3 \ - man/ibv_create_qp.3 man/ibv_create_srq.3 \ - man/ibv_create_xrc_rcv_qp.3 man/ibv_event_type_str.3 \ + man/ibv_create_qp.3 man/ibv_create_srq.3 man/ibv_event_type_str.3 \ man/ibv_fork_init.3 man/ibv_get_async_event.3 \ man/ibv_get_cq_event.3 man/ibv_get_device_guid.3 \ man/ibv_get_device_list.3 man/ibv_get_device_name.3 \ - man/ibv_modify_qp.3 man/ibv_modify_srq.3 man/ibv_modify_xrc_rcv_qp.3 \ - man/ibv_open_device.3 man/ibv_open_xrc_domain.3 \ + man/ibv_modify_qp.3 man/ibv_modify_srq.3 man/ibv_open_device.3 \ man/ibv_poll_cq.3 man/ibv_post_recv.3 man/ibv_post_send.3 \ man/ibv_post_srq_recv.3 man/ibv_query_device.3 man/ibv_query_gid.3 \ man/ibv_query_pkey.3 man/ibv_query_port.3 man/ibv_query_qp.3 \ - man/ibv_query_srq.3 man/ibv_query_xrc_rcv_qp.3 \ - man/ibv_rate_to_mult.3 man/ibv_reg_mr.3 man/ibv_reg_xrc_rcv_qp.3 \ - man/ibv_req_notify_cq.3 
man/ibv_resize_cq.3 man/verbs.7 - -DEBIAN = debian/changelog debian/compat debian/control debian/copyright \ - debian/ibverbs-utils.install debian/libibverbs1.install \ - debian/libibverbs1.postinst debian/libibverbs-dev.install \ - debian/rules + man/ibv_query_srq.3 man/ibv_rate_to_mult.3 man/ibv_reg_mr.3 \ + man/ibv_req_notify_cq.3 man/ibv_resize_cq.3 man/ibv_rate_to_mbps.3 \ + man/verbs.7 man/ibv_exp_reg_shared_mr.3 man/ibv_shared_mr.1 \ + man/ibv_exp_post_task.3 man/ibv_exp_modify_cq.3 \ + man/ibv_cc_pingpong.1 man/ibv_task_pingpong.1 man/ibv_create_qp_ex.3 \ + man/ibv_create_srq_ex.3 man/ibv_open_xrcd.3 man/ibv_get_srq_num.3 \ + man/ibv_open_qp.3 man/ibv_create_flow.3 \ + man/ibv_exp_create_cq.3 man/ibv_exp_modify_qp.3 \ + man/ibv_exp_poll_cq.3 man/ibv_exp_query_device.3 \ + man/ibv_exp_get_provider_func.3 man/ibv_exp_reg_mr.3 \ + man/ibv_exp_bind_mw.3 man/ibv_exp_create_qp.3 \ + man/ibv_exp_post_send.3 man/ibv_exp_create_dct.3 \ + man/ibv_exp_prefetch_mr.3 man/ibv_exp_alloc_mkey_list_memory.3 \ + man/ibv_exp_create_mr.3 man/ibv_exp_dealloc_mkey_list_memory.3 \ + man/ibv_exp_query_mkey.3 man/ibv_exp_query_dct.3 \ + man/ibv_exp_rereg_mr.3 man/ibv_exp_query_values.3 \ + man/ibv_alloc_mw.3 man/ibv_intf.1 man/ibv_exp_create_res_domain.3 \ + man/ibv_exp_query_intf.3 man/ibv_exp_create_wq.3 \ + man/ibv_exp_modify_wq.3 man/ibv_exp_create_rwq_ind_table.3 \ + man/ibv_exp_query_gid_attr.3 EXTRA_DIST = include/infiniband/driver.h include/infiniband/kern-abi.h \ include/infiniband/opcode.h include/infiniband/verbs.h include/infiniband/marshall.h \ include/infiniband/sa-kern-abi.h include/infiniband/sa.h \ - src/ibverbs.h examples/pingpong.h \ + src/ibverbs.h \ + tests/gtest/gtest-all.cc \ + examples/pingpong.h examples/cc_pingpong.h \ src/libibverbs.map libibverbs.spec.in $(man_MANS) +EXTRA_DIST += debian +EXTRA_DIST += autogen.sh + dist-hook: libibverbs.spec cp libibverbs.spec $(distdir) @@ -77,8 +164,6 @@ $(RM) ibv_ack_async_event.3 && \ $(RM) ibv_ack_cq_events.3 && \ 
$(RM) ibv_close_device.3 && \ - $(RM) ibv_close_xrc_domain.3 && \ - $(RM) ibv_create_xrc_srq.3 && \ $(RM) ibv_dealloc_pd.3 && \ $(RM) ibv_dereg_mr.3 && \ $(RM) ibv_destroy_ah.3 && \ @@ -89,15 +174,18 @@ $(RM) ibv_detach_mcast.3 && \ $(RM) ibv_free_device_list.3 && \ $(RM) ibv_init_ah_from_wc.3 && \ - $(RM) ibv_unreg_xrc_rcv_qp.3 && \ - $(RM) mult_to_ibv_rate.3 && \ $(RM) ibv_node_type_str.3 && \ $(RM) ibv_port_state_str.3 && \ + $(RM) ibv_close_xrcd.3 && \ + $(RM) ibv_destroy_flow.3 && \ + $(RM) ibv_dealloc_mw.3 && \ + $(RM) ibv_exp_destroy_res_domain.3 && \ + $(RM) ibv_exp_release_intf.3 && \ + $(RM) ibv_exp_destroy_wq.3 && \ + $(RM) ibv_exp_destroy_rwq_ind_table.3 && \ $(LN_S) ibv_get_async_event.3 ibv_ack_async_event.3 && \ $(LN_S) ibv_get_cq_event.3 ibv_ack_cq_events.3 && \ $(LN_S) ibv_open_device.3 ibv_close_device.3 && \ - $(LN_S) ibv_open_xrc_domain.3 ibv_close_xrc_domain.3 && \ - $(LN_S) ibv_create_srq.3 ibv_create_xrc_srq.3 && \ $(LN_S) ibv_alloc_pd.3 ibv_dealloc_pd.3 && \ $(LN_S) ibv_reg_mr.3 ibv_dereg_mr.3 && \ $(LN_S) ibv_create_ah.3 ibv_destroy_ah.3 && \ @@ -108,7 +196,13 @@ $(LN_S) ibv_attach_mcast.3 ibv_detach_mcast.3 && \ $(LN_S) ibv_get_device_list.3 ibv_free_device_list.3 && \ $(LN_S) ibv_create_ah_from_wc.3 ibv_init_ah_from_wc.3 && \ - $(LN_S) ibv_reg_xrc_rcv_qp.3 ibv_unreg_xrc_rcv_qp.3 && \ - $(LN_S) ibv_rate_to_mult.3 mult_to_ibv_rate.3 && \ $(LN_S) ibv_event_type_str.3 ibv_node_type_str.3 && \ - $(LN_S) ibv_event_type_str.3 ibv_port_state_str.3 + $(LN_S) ibv_event_type_str.3 ibv_port_state_str.3 && \ + $(LN_S) ibv_open_xrcd.3 ibv_close_xrcd.3 && \ + $(LN_S) ibv_create_flow.3 ibv_destroy_flow.3 && \ + $(LN_S) ibv_alloc_mw.3 ibv_dealloc_mw.3 && \ + $(LN_S) ibv_exp_create_res_domain.3 ibv_exp_destroy_res_domain.3 && \ + $(LN_S) ibv_exp_query_intf.3 ibv_exp_release_intf.3 && \ + $(LN_S) ibv_exp_create_wq.3 ibv_exp_destroy_wq.3 && \ + $(LN_S) ibv_exp_create_rwq_ind_table.3 ibv_exp_destroy_rwq_ind_table.3 + Index: contrib/ofed/libibverbs/autogen.sh 
=================================================================== --- contrib/ofed/libibverbs/autogen.sh +++ contrib/ofed/libibverbs/autogen.sh @@ -1,6 +1,5 @@ -#! /bin/sh +#! /bin/sh -exE -set -x aclocal -I config libtoolize --force --copy autoheader Index: contrib/ofed/libibverbs/configure.ac =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/configure.ac @@ -0,0 +1,127 @@ +dnl Process this file with autoconf to produce a configure script. + +AC_PREREQ(2.57) +AC_INIT(libibverbs, 1.1.8mlnx1, linux-rdma@vger.kernel.org) +AC_CONFIG_SRCDIR([src/ibverbs.h]) +AC_CONFIG_AUX_DIR(config) +AC_CONFIG_MACRO_DIR(config) +AC_CONFIG_HEADER(config.h) +AM_INIT_AUTOMAKE([1.10 foreign tar-ustar silent-rules subdir-objects]) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +dnl Checks for programs +AC_PROG_CC +AC_PROG_CXX +AC_GNU_SOURCE +AC_PROG_LN_S +AC_PROG_LIBTOOL + +LT_INIT + +########################## +# Enable tests +# +AC_ARG_ENABLE( + [test], + [AC_HELP_STRING([--enable-test], + [Build test programs (default=no)])], + [enable_test=$enableval], + [enable_test=no]) +AM_CONDITIONAL(TEST_ENABLE, test x$enable_test = xyes) + +########################## +# Enable support for valgrind +# +AC_ARG_WITH([valgrind], + AC_HELP_STRING([--with-valgrind], + [Enable Valgrind annotations (small runtime overhead, default NO)])) +if test x$with_valgrind = x || test x$with_valgrind = xno; then + want_valgrind=no + AC_DEFINE([NVALGRIND], 1, [Define to 1 to disable Valgrind annotations.]) +else + want_valgrind=yes + if test -d $with_valgrind; then + CPPFLAGS="$CPPFLAGS -I$with_valgrind/include" + fi +fi + +AC_ARG_WITH([resolve-neigh], + AC_HELP_STRING([--with-resolve-neigh], + [Enable neighbour resolution in Ethernet (default YES)])) +have_libnl=no +if test x$with_resolve_neigh = x || test x$with_resolve_neigh = xyes; then + PKG_CHECK_MODULES([LIBNL],[libnl-3.0],[ + have_libnl=yes + AC_DEFINE([HAVE_LIBNL3], [1], [Use 
libnl-3.0]) + AC_DEFINE([HAVE_LIBNL], [1], [Use libnl]) + PKG_CHECK_MODULES([LIBNL_ROUTE3], [libnl-route-3.0]) + LIBNL_CFLAGS="$LIBNL_CFLAGS $LIBNL_ROUTE3_CFLAGS" + LIBNL_LIBS="$LIBNL_LIBS $LIBNL_ROUTE3_LIBS"], [:] + ); + + PKG_CHECK_MODULES([LIBNL3_BUG], [libnl-3.0 >= 3.2.15 libnl-3.0 < 3.2.22], + [AC_DEFINE([HAVE_LIBNL3_BUG], [1], [Use libnl-3.2.15-21])], [:]) + + if test "$have_libnl" = no; then + PKG_CHECK_MODULES([LIBNL], [libnl-1], [have_libnl=yes + AC_DEFINE([HAVE_LIBNL1], [1], [Use libnl-1]) + AC_DEFINE([HAVE_LIBNL], [1], [Use libnl]) + AC_CHECK_LIB(nl, rtnl_link_vlan_get_id, [], + AC_MSG_ERROR([rtnl_link_vlan_get_id not found. libibverbs requires libnl.])) + ],[ + AC_MSG_ERROR([libibverbs requires libnl.]) + ]) + fi +else + AC_DEFINE([NRESOLVE_NEIGH], 1, [Define to 1 to disable neighbour resolution.]) +fi +AM_CONDITIONAL([HAVE_LIBNL], [test "$have_libnl" = "yes"]) +AC_SUBST([LIBNL_CFLAGS]) +AC_SUBST([LIBNL_LIBS]) +AM_CONDITIONAL(NO_RESOLVE_NEIGH, test x$with_resolve_neigh = xno) + +dnl Checks for libraries +AC_CHECK_LIB(dl, dlsym, [], + AC_MSG_ERROR([dlsym() not found. libibverbs requires libdl.])) +AC_CHECK_LIB(pthread, pthread_mutex_init, [], + AC_MSG_ERROR([pthread_mutex_init() not found. libibverbs requires libpthread.])) + +dnl Checks for header files. +AC_HEADER_STDC + +if test x$want_valgrind = xyes; then +AC_CHECK_HEADER(valgrind/memcheck.h, + [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1, + [Define to 1 if you have the <valgrind/memcheck.h> header file.])], + [if test $want_valgrind = yes; then + AC_MSG_ERROR([Valgrind memcheck support requested, but not found.]) + fi]) +fi + +dnl Checks for typedefs, structures, and compiler characteristics. 
+AC_C_CONST + +AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script, + [if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then + ac_cv_version_script=yes + else + ac_cv_version_script=no + fi]) + +if test $ac_cv_version_script = yes; then + LIBIBVERBS_VERSION_SCRIPT='-Wl,--version-script=$(srcdir)/src/libibverbs.map' +else + LIBIBVERBS_VERSION_SCRIPT= +fi +AC_SUBST(LIBIBVERBS_VERSION_SCRIPT) + +AC_CACHE_CHECK(for .symver assembler support, ac_cv_asm_symver_support, + [AC_TRY_COMPILE(, [asm("symbol:\n.symver symbol, api@ABI\n");], + ac_cv_asm_symver_support=yes, + ac_cv_asm_symver_support=no)]) +if test $ac_cv_asm_symver_support = yes; then + AC_DEFINE([HAVE_SYMVER_SUPPORT], 1, [assembler has .symver support]) +fi + +AC_CONFIG_FILES([Makefile libibverbs.spec]) +AC_OUTPUT Index: contrib/ofed/libibverbs/debian/changelog =================================================================== --- contrib/ofed/libibverbs/debian/changelog +++ /dev/null @@ -1,78 +0,0 @@ -libibverbs (1.1.2-1) unstable; urgency=low - - * New upstream release. - - Fix memory registration failure cause by too-big madvise() - - Fix many Valgrind false positives - - Add functions to convert enum values to strings - * Replace deprecated ${Source-Version} with ${binary:Version} - * Use DEB_DH_MAKESHLIBS_ARGS_ALL to pass appropriate -V option to - dh_makeshlibs, since new symbols were added in libibverbs 1.1.2. - (Closes: #465435) - * Add debian/watch file. - * Update control file to talk about generic RDMA and iWARP, not just - InfiniBand, since libibverbs works with both IB and iWARP. - * Acknowledge NMU (Closes: #442638). - - -- Roland Dreier Fri, 18 Apr 2008 15:08:52 -0700 - -libibverbs (1.1.1-1.1) unstable; urgency=low - - * Non-maintainer upload. - * Re-generated autotools files to fix double build bug, closes: #442638 - * Bumped Standards-Version to 3.7.3, no change needed. 
- - -- Michael Meskes Mon, 14 Apr 2008 10:07:58 +0000 - -libibverbs (1.1.1-1) unstable; urgency=low - - * New upstream release. - - Initialize state of newly created QPs to RESET (fixes problems - with libmlx4/ConnectX HCAs). - - Don't warn root about RLIMIT_MEMLOCK, since it doesn't matter. - - Fix free() errors in ibv_xx_pingpong examples. - - -- Roland Dreier Fri, 15 Jun 2007 12:49:02 -0700 - -libibverbs (1.1-1) unstable; urgency=low - - * New upstream release. - - Add support for use of fork() in applications. - - Add manual pages documenting API in section 3. - - New method of finding and loading device-specific drivers. - - Add basic support for iWARP devices. - - Provide compatible ABI for applications linked against libibverbs 1.0. - * Update libtool during build to avoid setting RPATH in binaries on amd64. - - -- Roland Dreier Sat, 28 Apr 2007 14:15:29 -0700 - -libibverbs (1.0.4-1) unstable; urgency=low - - * New upstream release. - - Fix static linking so it has a chance of working. - - Fix cut-and-paste error in sparc mb() macro. - - Other miscellaneous fixes. - * Improve package description. - - -- Roland Dreier Tue, 31 Oct 2006 15:04:33 -0800 - -libibverbs (1.0.3-1) unstable; urgency=low - - * Change priority to extra, since libibverbs depends on libsysfs2, which - has priority extra. (Debian policy section 2.5 states that a package - may not depend on another package of lower priority) - * New upstream release: - - For sparc, only generate membar instruction if compiling for V9 - instruction set. (Closes: #365559) - - Reduce (but not yet eliminate) dependency on libsysfs. - - Deprecate some ib_XXX symbol names and introduce ibv_XXX - replacements for internal consistency. - - Other minor fixes. - * Update to Standards-Version: 3.7.2. - - -- Roland Dreier Tue, 2 May 2006 15:33:14 -0700 - -libibverbs (1.0.2-1) unstable; urgency=low - - * Initial Release. 
(Closes: #325752) - - -- Roland Dreier Wed, 15 Feb 2006 11:21:59 -0700 Index: contrib/ofed/libibverbs/debian/compat =================================================================== --- contrib/ofed/libibverbs/debian/compat +++ /dev/null @@ -1 +0,0 @@ -5 Index: contrib/ofed/libibverbs/debian/control.in =================================================================== --- contrib/ofed/libibverbs/debian/control.in +++ /dev/null @@ -1,80 +0,0 @@ -Source: libibverbs -Priority: extra -Maintainer: Roland Dreier -Build-Depends: @cdbs@, dpkg-dev (>= 1.13.19) -Standards-Version: 3.7.3 -Section: libs -Homepage: http://www.openfabrics.org/ - -Package: libibverbs1 -Section: libs -Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends}, adduser -Description: A library for direct userspace use of RDMA (InfiniBand/iWARP) - libibverbs is a library that allows userspace processes to use RDMA - "verbs" as described in the InfiniBand Architecture Specification and - the RDMA Protocol Verbs Specification. iWARP ethernet NICs support - RDMA over hardware-offloaded TCP/IP, while InfiniBand is a - high-throughput, low-latency networking technology. InfiniBand host - channel adapters (HCAs) and iWARP NICs commonly support direct - hardware access from userspace (kernel bypass), and libibverbs - supports this when available. - . - For this library to be useful, a device-specific plug-in module - should also be installed. - . - This package contains the shared library. - -Package: libibverbs-dev -Section: libdevel -Architecture: any -Depends: ${misc:Depends}, libibverbs1 (= ${binary:Version}) -Description: Development files for the libibverbs library - libibverbs is a library that allows userspace processes to use RDMA - "verbs" as described in the InfiniBand Architecture Specification and - the RDMA Protocol Verbs Specification. iWARP ethernet NICs support - RDMA over hardware-offloaded TCP/IP, while InfiniBand is a - high-throughput, low-latency networking technology. 
InfiniBand host - channel adapters (HCAs) and iWARP NICs commonly support direct - hardware access from userspace (kernel bypass), and libibverbs - supports this when available. - . - This package is needed to compile programs against libibverbs1. - It contains the header files and static libraries (optionally) - needed for compiling. - -Package: libibverbs1-dbg -Section: libdevel -Priority: extra -Architecture: any -Depends: ${misc:Depends}, libibverbs1 (= ${binary:Version}) -Description: Debugging symbols for the libibverbs library - libibverbs is a library that allows userspace processes to use RDMA - "verbs" as described in the InfiniBand Architecture Specification and - the RDMA Protocol Verbs Specification. iWARP ethernet NICs support - RDMA over hardware-offloaded TCP/IP, while InfiniBand is a - high-throughput, low-latency networking technology. InfiniBand host - channel adapters (HCAs) and iWARP NICs commonly support direct - hardware access from userspace (kernel bypass), and libibverbs - supports this when available. - . - This package contains the debugging symbols associated with - libibverbs1. They will automatically be used by gdb for debugging - libibverbs-related issues. - -Package: ibverbs-utils -Section: net -Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends} -Description: Examples for the libibverbs library - libibverbs is a library that allows userspace processes to use RDMA - "verbs" as described in the InfiniBand Architecture Specification and - the RDMA Protocol Verbs Specification. iWARP ethernet NICs support - RDMA over hardware-offloaded TCP/IP, while InfiniBand is a - high-throughput, low-latency networking technology. InfiniBand host - channel adapters (HCAs) and iWARP NICs commonly support direct - hardware access from userspace (kernel bypass), and libibverbs - supports this when available. - . 
- This package contains useful libibverbs1 example programs such as - ibv_devinfo, which displays information about InfiniBand devices. Index: contrib/ofed/libibverbs/debian/copyright =================================================================== --- contrib/ofed/libibverbs/debian/copyright +++ /dev/null @@ -1,49 +0,0 @@ -Initial Debianization: -This package was debianized by Roland Dreier on -Mon, 25 Apr 2005 10:21:08 -0700. - -Source: -It was downloaded from the OpenIB web site at - - -Authors: - Roland Dreier - Dotan Barak - Sean Hefty - Michael S. Tsirkin - -Portions are copyrighted by: - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. - * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Voltaire, Inc. All rights reserved. - -libibverbs is licensed under a choice of one of two licenses. You may -choose to be licensed under the terms of the GNU General Public -License (GPL) Version 2, available from the file -/usr/share/common-licenses/GPL-2 on your Debian system, or the -OpenIB.org BSD license below: - - Redistribution and use in source and binary forms, with or - without modification, are permitted provided that the following - conditions are met: - - - Redistributions of source code must retain the above - copyright notice, this list of conditions and the following - disclaimer. - - - Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. Index: contrib/ofed/libibverbs/debian/ibverbs-utils.install =================================================================== --- contrib/ofed/libibverbs/debian/ibverbs-utils.install +++ /dev/null @@ -1,2 +0,0 @@ -usr/bin -usr/share/man/man1 Index: contrib/ofed/libibverbs/debian/libibverbs-dev.install =================================================================== --- contrib/ofed/libibverbs/debian/libibverbs-dev.install +++ /dev/null @@ -1,3 +0,0 @@ -usr/include -usr/lib/libibverbs*.{a,la,so} -usr/share/man/man3 Index: contrib/ofed/libibverbs/debian/libibverbs-dev.links =================================================================== --- contrib/ofed/libibverbs/debian/libibverbs-dev.links +++ /dev/null @@ -1,16 +0,0 @@ -usr/share/man/man3/ibv_get_async_event.3 usr/share/man/man3/ibv_ack_async_event.3 -usr/share/man/man3/ibv_get_cq_event.3 usr/share/man/man3/ibv_ack_cq_events.3 -usr/share/man/man3/ibv_open_device.3 usr/share/man/man3/ibv_close_device.3 -usr/share/man/man3/ibv_alloc_pd.3 usr/share/man/man3/ibv_dealloc_pd.3 -usr/share/man/man3/ibv_reg_mr.3 usr/share/man/man3/ibv_dereg_mr.3 -usr/share/man/man3/ibv_create_ah.3 usr/share/man/man3/ibv_destroy_ah.3 -usr/share/man/man3/ibv_create_comp_channel.3 usr/share/man/man3/ibv_destroy_comp_channel.3 -usr/share/man/man3/ibv_create_cq.3 usr/share/man/man3/ibv_destroy_cq.3 -usr/share/man/man3/ibv_create_qp.3 usr/share/man/man3/ibv_destroy_qp.3 -usr/share/man/man3/ibv_create_srq.3 usr/share/man/man3/ibv_destroy_srq.3 -usr/share/man/man3/ibv_attach_mcast.3 usr/share/man/man3/ibv_detach_mcast.3 -usr/share/man/man3/ibv_get_device_list.3 usr/share/man/man3/ibv_free_device_list.3 -usr/share/man/man3/ibv_create_ah_from_wc.3 
usr/share/man/man3/ibv_init_ah_from_wc.3 -usr/share/man/man3/ibv_rate_to_mult.3 usr/share/man/man3/mult_to_ibv_rate.3 -usr/share/man/man3/ibv_event_type_str.3 usr/share/man/man3/ibv_node_type_str.3 -usr/share/man/man3/ibv_event_type_str.3 usr/share/man/man3/ibv_port_state_str.3 Index: contrib/ofed/libibverbs/debian/libibverbs1.install =================================================================== --- contrib/ofed/libibverbs/debian/libibverbs1.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/libibverbs*.so.* Index: contrib/ofed/libibverbs/debian/libibverbs1.postinst =================================================================== --- contrib/ofed/libibverbs/debian/libibverbs1.postinst +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -# postinst script for libibverbs1 - -set -e - -if [ "$1" != configure ]; then - exit 0 -fi - -getent group rdma > /dev/null 2>&1 || addgroup --system --quiet rdma - -#DEBHELPER# Index: contrib/ofed/libibverbs/debian/rules =================================================================== --- contrib/ofed/libibverbs/debian/rules +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/make -f -# -*- mode: makefile; coding: utf-8 -*- - -DEB_DH_INSTALL_SOURCEDIR := debian/tmp -DEB_AUTO_UPDATE_LIBTOOL := post -DEB_DH_MAKESHLIBS_ARGS_ALL := -V 'libibverbs1 (>= 1.1.2)' - -include /usr/share/cdbs/1/rules/debhelper.mk -include /usr/share/cdbs/1/class/autotools.mk Index: contrib/ofed/libibverbs/debian/watch =================================================================== --- contrib/ofed/libibverbs/debian/watch +++ /dev/null @@ -1,3 +0,0 @@ -version=3 -opts="uversionmangle=s/-rc/~rc/" \ - http://www.openfabrics.org/downloads/verbs/libibverbs-(.+)\.tar\.gz Index: contrib/ofed/libibverbs/examples/asyncwatch.c =================================================================== --- contrib/ofed/libibverbs/examples/asyncwatch.c +++ contrib/ofed/libibverbs/examples/asyncwatch.c @@ -35,6 +35,10 @@ #endif /* HAVE_CONFIG_H */ #include +#include +#include +#include 
+#include #include @@ -74,35 +78,74 @@ } } +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start an asyncwatch process\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); +} + int main(int argc, char *argv[]) { struct ibv_device **dev_list; struct ibv_context *context; struct ibv_async_event event; + char *ib_devname = NULL; + int i = 0; /* Force line-buffering in case stdout is redirected */ setvbuf(stdout, NULL, _IOLBF, 0); + while (1) { + int c; + static struct option long_options[] = { + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "help", .has_arg = 0, .val = 'h' }, + { 0 } + }; + c = getopt_long(argc, argv, "d:h", long_options, NULL); + if (c == -1) + break; + switch (c) { + case 'd': + ib_devname = strdupa(optarg); + break; + case 'h': + /* fall through */ + default: + usage(argv[0]); + return 1; + } + } dev_list = ibv_get_device_list(NULL); if (!dev_list) { perror("Failed to get IB devices list"); return 1; } + if (ib_devname) { + for (; dev_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + } + } - if (!*dev_list) { - fprintf(stderr, "No IB devices found\n"); + if (!dev_list[i]) { + fprintf(stderr, "IB device %s not found\n", + ib_devname ? 
ib_devname : ""); return 1; } - context = ibv_open_device(*dev_list); + context = ibv_open_device(dev_list[i]); if (!context) { fprintf(stderr, "Couldn't get context for %s\n", - ibv_get_device_name(*dev_list)); + ibv_get_device_name(dev_list[i])); return 1; } printf("%s: async event FD %d\n", - ibv_get_device_name(*dev_list), context->async_fd); + ibv_get_device_name(dev_list[i]), context->async_fd); while (1) { if (ibv_get_async_event(context, &event)) Index: contrib/ofed/libibverbs/examples/build/Makefile.inc =================================================================== --- contrib/ofed/libibverbs/examples/build/Makefile.inc +++ contrib/ofed/libibverbs/examples/build/Makefile.inc @@ -1,9 +1,12 @@ BINDIR?= /usr/bin CFLAGS+= \ + -DHAVE_CONFIG_H=1 \ -I../../../../../../sys/ofed/include \ -I../../../../libibverbs/include \ - -I../../../../include + -I../../../../include \ + -I../../../../include/infiniband \ + -I../../../../usr.lib/libibverbs -LDADD+= -libverbs -lmlx4 -lmthca -pthread +LIBADD+= ibverbs mlx4 mlx5 mthca pthread Index: contrib/ofed/libibverbs/examples/cc_pingpong.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/cc_pingpong.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2009-2010 Mellanox Technologies. All rights reserved. 
+ */ + +#ifndef IBV_CC_PINGPONG_H +#define IBV_CC_PINGPONG_H + +#include +#include +#include + + +#define FLOAT64 double + +enum pp_wr_data_type { + PP_DATA_TYPE_INT8 = 0, + PP_DATA_TYPE_INT16, + PP_DATA_TYPE_INT32, + PP_DATA_TYPE_INT64, + PP_DATA_TYPE_UINT8, + PP_DATA_TYPE_UINT16, + PP_DATA_TYPE_UINT32, + PP_DATA_TYPE_UINT64, + PP_DATA_TYPE_FLOAT32, + PP_DATA_TYPE_FLOAT64, + PP_DATA_TYPE_FLOAT96, + PP_DATA_TYPE_INVALID /* Keep Last */ +}; + +enum pp_wr_calc_op { + PP_CALC_LXOR = 0, + PP_CALC_BXOR, + PP_CALC_LOR, + PP_CALC_BOR, + PP_CALC_LAND, + PP_CALC_BAND, + PP_CALC_ADD, + PP_CALC_MAX, + PP_CALC_MIN, + PP_CALC_MAXLOC, + PP_CALC_MINLOC, + PP_CALC_PROD, + PP_CALC_INVALID /* Keep Last */ +}; + +static struct { + char size; + const char str[32]; +} pp_wr_data_type_str[] = { + [PP_DATA_TYPE_INT8] = { .size = 1, .str = "INT8" }, + [PP_DATA_TYPE_INT16] = { .size = 2, .str = "INT16"}, + [PP_DATA_TYPE_INT32] = { .size = 4, .str = "INT32"}, + [PP_DATA_TYPE_INT64] = { .size = 8, .str = "INT64"}, + [PP_DATA_TYPE_UINT8] = { .size = 1, .str = "UINT8" }, + [PP_DATA_TYPE_UINT16] = { .size = 2, .str = "UINT16"}, + [PP_DATA_TYPE_UINT32] = { .size = 4, .str = "UINT32"}, + [PP_DATA_TYPE_UINT64] = { .size = 8, .str = "UINT64"}, + [PP_DATA_TYPE_FLOAT32] = { .size = 4, .str = "FLOAT32"}, + [PP_DATA_TYPE_FLOAT64] = { .size = 8, .str = "FLOAT64"}, +}; + +static const char pp_wr_calc_op_str[][32] = { + [PP_CALC_LXOR] = "XOR", + [PP_CALC_BXOR] = "BXOR", + [PP_CALC_LOR] = "LOR", + [PP_CALC_BOR] = "BOR", + [PP_CALC_LAND] = "LAND", + [PP_CALC_BAND] = "BAND", + [PP_CALC_ADD] = "ADD", + [PP_CALC_MAX] = "MAX", + [PP_CALC_MIN] = "MIN", + [PP_CALC_MAXLOC] = "MAXLOC", + [PP_CALC_MINLOC] = "MINLOC", + [PP_CALC_PROD] = "PROD" +}; + +static inline int pp_calc_data_size_to_bytes(enum ibv_exp_calc_data_size data_size) +{ + switch (data_size) { + case IBV_EXP_CALC_DATA_SIZE_64_BIT: return 8; + case IBV_EXP_CALC_DATA_SIZE_NUMBER: /* fall through */ + default: return -1; + } +} + +static inline int 
pp_query_calc_cap(struct ibv_context *context, + enum ibv_exp_calc_op calc_op, + enum ibv_exp_calc_data_type data_type, + enum ibv_exp_calc_data_size data_size, + int *operands_per_gather, + int *max_num_operands) +{ + /* TODO: check using pp_query_device() should be added */ + + if (operands_per_gather) + *operands_per_gather = 1; + + if (max_num_operands) + *max_num_operands = 2; + + return 0; +} + +static inline void pp_print_data_type(void) +{ + int i; + + for (i = 0; i < PP_DATA_TYPE_INVALID; i++) + printf("\t%s\n", pp_wr_data_type_str[i].str); +} + +static inline const char *pp_data_type_to_str(enum pp_wr_data_type data_type) +{ + if (data_type < sizeof(pp_wr_data_type_str)/sizeof(pp_wr_data_type_str[0])) + return pp_wr_data_type_str[data_type].str; + + return "INVALID DATA TYPE"; + +} + +static inline int pp_data_type_to_size(enum pp_wr_data_type data_type) +{ + if (data_type < sizeof(pp_wr_data_type_str)/sizeof(pp_wr_data_type_str[0])) + return pp_wr_data_type_str[data_type].size; + + return -1; +} + +static inline enum pp_wr_data_type pp_str_to_data_type(const char *data_type_str) +{ + int i; + + for (i = 0; i < sizeof(pp_wr_data_type_str)/sizeof(pp_wr_data_type_str[0]); i++) { + if (!strcmp(data_type_str, pp_wr_data_type_str[i].str)) + return i; + } + + return PP_DATA_TYPE_INVALID; +} + +static inline void pp_print_calc_op(void) +{ + int i; + + for (i = 0; i < PP_CALC_INVALID; i++) + printf("\t%s\n", pp_wr_calc_op_str[i]); +} + +static inline const char *pp_calc_op_to_str(enum pp_wr_calc_op calc_op) +{ + if (calc_op < sizeof(pp_wr_calc_op_str)/sizeof(pp_wr_calc_op_str[0])) + return pp_wr_calc_op_str[calc_op]; + + return "INVALID OPERATION OPCODE"; + +} + +static inline enum pp_wr_calc_op pp_str_to_calc_op(const char *calc_op) +{ + int i; + + for (i = 0; i < sizeof(pp_wr_calc_op_str)/sizeof(pp_wr_calc_op_str[0]); i++) { + if (!strcmp(calc_op, pp_wr_calc_op_str[i])) + return i; + } + + return PP_CALC_INVALID; +} + +static inline void 
pp_print_dev_calc_ops(struct ibv_context *context) +{ + /* TODO: check using pp_query_device() should be added */ +#if 0 + int i, j, flag, supp; + + for (i = 0; i < PP_CALC_INVALID; i++) { + flag = 0; + + for (j = 0; j < PP_DATA_TYPE_INVALID; j++) { + supp = pp_query_calc_cap(context, i, j, NULL, NULL); + + if (!supp) { + if (!flag) { + printf("\t%s:\n", pp_calc_op_to_str(i)); + flag = 1; + } + + printf("\t\t%s\n", pp_data_type_to_str(j)); + } + } + } +#endif +} + +static inline enum ibv_mtu pp_mtu_to_enum(int mtu) +{ + switch (mtu) { + case 256: return IBV_MTU_256; + case 512: return IBV_MTU_512; + case 1024: return IBV_MTU_1024; + case 2048: return IBV_MTU_2048; + case 4096: return IBV_MTU_4096; + default: return -1; + } +} + +static inline uint16_t pp_get_local_lid(struct ibv_context *context, int port) +{ + struct ibv_port_attr attr; + + if (ibv_query_port(context, port, &attr)) + return 0; + + return attr.lid; +} + +#endif /* IBV_CC_PINGPONG_H */ Index: contrib/ofed/libibverbs/examples/cc_pingpong.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/cc_pingpong.c @@ -0,0 +1,1761 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2009-2010 Mellanox Technologies. All rights reserved. + */ + +#if HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cc_pingpong.h" + + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define LAMBDA (0.00001) + +#define EXEC_INT(calc_op, op1, op2) \ + ((calc_op) == PP_CALC_LXOR ? ((!(op1) && (op2)) || ((op1) && !(op2))) \ + : (calc_op) == PP_CALC_BXOR ? (((op1) ^ (op2))) \ + : (calc_op) == PP_CALC_LOR ? (((op1) || (op2))) \ + : (calc_op) == PP_CALC_BOR ? (((op1) | (op2))) \ + : (calc_op) == PP_CALC_LAND ? 
(((op1) && (op2))) \ + : (calc_op) == PP_CALC_BAND ? (((op1) & (op2))) \ + : EXEC_FLOAT(calc_op, op1, op2)) + +#define EXEC_FLOAT(calc_op, op1, op2) \ + ((calc_op) == PP_CALC_ADD ? (((op1) + (op2))) \ + : (calc_op) == PP_CALC_MAX ? (MAX((op1), (op2))) \ + : (calc_op) == PP_CALC_MIN ? (MIN((op1), (op2))) \ + : (calc_op) == PP_CALC_MAXLOC ? (MAX((op1), (op2))) \ + : (calc_op) == PP_CALC_MINLOC ? (MIN((op1), (op2))) \ + : 0) + +#define VERIFY_FLOAT(calc_op, data_type, op1, op2, res) \ + ((calc_op) == PP_CALC_ADD ? \ + ((fabs((data_type)EXEC_FLOAT(calc_op, op1, op2) - (res))) < LAMBDA)\ + : (((data_type)EXEC_FLOAT(calc_op, op1, op2)) == (res))) \ + + +#define VERIFY_INT(calc_op, data_type, op1, op2, res) \ + (((data_type)EXEC_INT(calc_op, op1, op2)) == (res)) + + +#define EXEC_VER_FLOAT(verify, calc_op, data_type, op1, op2, res) \ + ((verify) ? \ + (VERIFY_FLOAT(calc_op, data_type, (*(data_type *)op1), \ + (*(data_type *)op2), (*(data_type *)res))) \ + : (data_type)EXEC_FLOAT(calc_op, (*(data_type *)op1), (*(data_type *)op2))) + +#define EXEC_VER_INT(verify, calc_op, data_type, op1, op2, res) \ + ((verify) ? \ + (VERIFY_INT(calc_op, data_type, (*(data_type *)op1), \ + (*(data_type *)op2), (*(data_type *)res))) \ + : (data_type)EXEC_INT(calc_op, (*(data_type *)op1), (*(data_type *)op2))) + + +#define EXEC_VERIFY(calc_data_type, calc_op, verify, op1, op2, res) \ + ((calc_data_type) == PP_DATA_TYPE_INT8 ? \ + EXEC_VER_INT(verify, calc_op, int8_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_INT16 ? \ + EXEC_VER_INT(verify, calc_op, int16_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_INT32 ? \ + EXEC_VER_INT(verify, calc_op, int32_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_INT64 ? \ + EXEC_VER_INT(verify, calc_op, int64_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_UINT8 ? \ + EXEC_VER_INT(verify, calc_op, uint8_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_UINT16 ? 
\ + EXEC_VER_INT(verify, calc_op, uint16_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_UINT32 ? \ + EXEC_VER_INT(verify, calc_op, uint32_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_UINT64 ? \ + EXEC_VER_INT(verify, calc_op, uint64_t, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_FLOAT32 ? \ + EXEC_VER_FLOAT(verify, calc_op, float, op1, op2, res) \ + : (calc_data_type) == PP_DATA_TYPE_FLOAT64 ? \ + EXEC_VER_FLOAT(verify, calc_op, FLOAT64, op1, op2, res) \ + : 0) + +enum { + PP_RECV_WRID = 1, + PP_SEND_WRID = 2, + PP_CQE_WAIT = 3, +}; + +char *wr_id_str[] = { + [PP_RECV_WRID] = "RECV", + [PP_SEND_WRID] = "SEND", + [PP_CQE_WAIT] = "CQE_WAIT", +}; + +static long page_size; + +struct pingpong_calc_ctx { + enum pp_wr_calc_op init_opcode; + enum pp_wr_data_type init_data_type; + enum ibv_exp_calc_op opcode; + enum ibv_exp_calc_data_type data_type; + enum ibv_exp_calc_data_size data_size; + void *gather_buff; + int gather_list_size; + struct ibv_sge *gather_list; +}; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *cq; + struct ibv_qp *qp; + + struct ibv_qp *mqp; + struct ibv_cq *mcq; + + void *buf; + void *net_buf; + int size; + int rx_depth; + int pending; + uint64_t last_result; + + struct pingpong_calc_ctx calc_op; +}; + +struct pingpong_dest { + int lid; + int qpn; + int psn; +}; + +static int pp_prepare_net_buff(int do_neg, + enum pp_wr_data_type type, + const void *in_buff, void *net_buff, + enum ibv_exp_calc_data_type *out_type, + enum ibv_exp_calc_data_size *out_size) +{ + int to_mult = (do_neg ? 
-1 : 1); + int rc = 0; + + *out_size = IBV_EXP_CALC_DATA_SIZE_64_BIT; + + switch (type) { + case PP_DATA_TYPE_INT8: + *(uint64_t *)net_buff = *(uint8_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_INT; + break; + + case PP_DATA_TYPE_UINT8: + *(uint64_t *)net_buff = *(uint8_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_UINT; + break; + + case PP_DATA_TYPE_INT16: + *(uint64_t *)net_buff = *(uint16_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_INT; + break; + + case PP_DATA_TYPE_UINT16: + *(uint64_t *)net_buff = *(uint16_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_UINT; + break; + + case PP_DATA_TYPE_INT32: + *(uint64_t *)net_buff = *(uint32_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_INT; + break; + + case PP_DATA_TYPE_UINT32: + *(uint64_t *)net_buff = *(uint32_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_UINT; + break; + + case PP_DATA_TYPE_INT64: + *(uint64_t *)net_buff = *(uint64_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_INT; + break; + + case PP_DATA_TYPE_UINT64: + *(uint64_t *)net_buff = *(uint64_t *)in_buff * to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_UINT; + break; + + case PP_DATA_TYPE_FLOAT32: + *(double *)net_buff = (double)(*(float *)in_buff * (float)to_mult); + *out_type = IBV_EXP_CALC_DATA_TYPE_FLOAT; + break; + + case PP_DATA_TYPE_FLOAT64: + *(double *)net_buff = *(double *)in_buff * (double)to_mult; + *out_type = IBV_EXP_CALC_DATA_TYPE_FLOAT; + break; + + default: + fprintf(stderr, "invalid data type %d\n", type); + rc = EINVAL; + }; + + return rc; +} + +static inline int pp_prepare_host_buff(int do_neg, + enum pp_wr_data_type type, + const void *in_buff, void *host_buff) +{ + union { + uint64_t ll; + double lf; + } tmp_buff; + int to_mult = (do_neg ? 
-1 : 1); + int rc = 0; + + /* todo - add better support in FLOAT */ + tmp_buff.ll = ntohll(*(uint64_t *)in_buff) * to_mult; + + switch (type) { + case PP_DATA_TYPE_INT8: + case PP_DATA_TYPE_UINT8: + *(uint8_t *)host_buff = (uint8_t)tmp_buff.ll; + break; + + case PP_DATA_TYPE_INT16: + case PP_DATA_TYPE_UINT16: + *(uint16_t *)host_buff = (uint16_t)tmp_buff.ll; + break; + + case PP_DATA_TYPE_INT32: + case PP_DATA_TYPE_UINT32: + *(uint32_t *)host_buff = (uint32_t)tmp_buff.ll; + break; + + case PP_DATA_TYPE_INT64: + case PP_DATA_TYPE_UINT64: + *(uint64_t *)host_buff = (uint64_t)tmp_buff.ll; + break; + + case PP_DATA_TYPE_FLOAT32: + *(float *)host_buff = (float)tmp_buff.lf; + break; + + case PP_DATA_TYPE_FLOAT64: + *(double *)host_buff = (double)tmp_buff.lf; + break; + + default: + fprintf(stderr, "invalid data type %d\n", type); + rc = EINVAL; + }; + + return rc; +} + +struct calc_pack_input { + enum pp_wr_calc_op op; + enum pp_wr_data_type type; + const void *host_buf; + uint64_t id; + enum ibv_exp_calc_op *out_op; + enum ibv_exp_calc_data_type *out_type; + enum ibv_exp_calc_data_size *out_size; + void *net_buf; +}; + +struct calc_unpack_input { + enum pp_wr_calc_op op; + enum pp_wr_data_type type; + const void *net_buf; + uint64_t *id; + void *host_buf; +}; + +/** + * pp_pack_data_for_calc - modify the format of the data read from the source + * buffer so calculation can be done on it. + * + * The function may also modify the operation, to match the modified data. 
+ */ +static int pp_pack_data_for_calc(struct ibv_context *context, + struct calc_pack_input *params) +{ + enum pp_wr_calc_op op; + enum pp_wr_data_type type; + const void *host_buffer; + uint64_t id; + enum ibv_exp_calc_op *out_op; + enum ibv_exp_calc_data_type *out_type; + enum ibv_exp_calc_data_size *out_size; + void *network_buffer; + int do_neg = 0; + int conv_op_to_bin = 0; + + /* input parameters check */ + if (!context || + !params || + !params->host_buf || + !params->net_buf || + !params->out_op || + !params->out_type || + !params->out_size || + params->type == PP_DATA_TYPE_INVALID || + params->op == PP_CALC_INVALID) + return EINVAL; + + /* network buffer must be 16B aligned */ + if ((uintptr_t)(params->net_buf) % 16) { + fprintf(stderr, "network buffer must be 16B aligned\n"); + return EINVAL; + } + + op = params->op; + type = params->type; + host_buffer = params->host_buf; + id = params->id; + out_op = params->out_op; + out_type = params->out_type; + out_size = params->out_size; + network_buffer = params->net_buf; + + *out_op = IBV_EXP_CALC_OP_NUMBER; + *out_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; + *out_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; + + switch (op) { + case PP_CALC_LXOR: + *out_op = IBV_EXP_CALC_OP_BXOR; + conv_op_to_bin = 1; + break; + + case PP_CALC_LOR: + *out_op = IBV_EXP_CALC_OP_BOR; + conv_op_to_bin = 1; + break; + + case PP_CALC_LAND: + *out_op = IBV_EXP_CALC_OP_BAND; + conv_op_to_bin = 1; + break; + + case PP_CALC_MIN: + *out_op = IBV_EXP_CALC_OP_MAXLOC; + do_neg = 1; + break; + + case PP_CALC_BXOR: + *out_op = IBV_EXP_CALC_OP_BXOR; + break; + + case PP_CALC_BOR: + *out_op = IBV_EXP_CALC_OP_BOR; + break; + + case PP_CALC_BAND: + *out_op = IBV_EXP_CALC_OP_BAND; + break; + + case PP_CALC_ADD: + *out_op = IBV_EXP_CALC_OP_ADD; + break; + + case PP_CALC_MAX: + *out_op = IBV_EXP_CALC_OP_MAXLOC; + break; + + case PP_CALC_MAXLOC: + case PP_CALC_MINLOC: + case PP_CALC_PROD: /* Unsupported operation */ + case PP_CALC_INVALID: + default: + 
fprintf(stderr, "unsupported op %d\n", op); + return EINVAL; + } + + /* convert data from user defined buffer to hardware supported representation */ + if (pp_prepare_net_buff(do_neg, type, host_buffer, network_buffer, out_type, out_size)) + return EINVAL; + + /* logical operations use true/false */ + if (conv_op_to_bin) + *(uint64_t *)network_buffer = !!(*(uint64_t *)network_buffer); + + /* convert to network order supported by hardware */ + *(uint64_t *)network_buffer = htonll(*(uint64_t *)network_buffer); + + /* for MINLOC/MAXLOC - copy the ID to the network buffer */ + if (op == PP_CALC_MINLOC || op == PP_CALC_MAXLOC) + *(uint64_t *)((unsigned char *)network_buffer + 8) = htonll(id); + + return 0; +} + +/** + * pp_unpack_data_from_calc - modify the format of the data read from the + * network to the format in which the host expects it. + */ +static int pp_unpack_data_from_calc(struct ibv_context *context, + struct calc_unpack_input *params) +{ + enum pp_wr_calc_op op; + enum pp_wr_data_type type; + const void *network_buffer; + uint64_t *id; + void *host_buffer; + int do_neg = 0; + + if (!context || + !params || + !params->net_buf || + !params->host_buf || + params->type == PP_DATA_TYPE_INVALID || + params->op == PP_CALC_INVALID) + return EINVAL; + + op = params->op; + type = params->type; + network_buffer = params->net_buf; + id = params->id; + host_buffer = params->host_buf; + + /* Check if it's needed to convert the buffer & operation */ + if ((op == PP_CALC_MIN) || (op == PP_CALC_MINLOC)) + do_neg = 1; + + /* convert data from hardware supported data representation to user defined buffer */ + if (pp_prepare_host_buff(do_neg, type, network_buffer, host_buffer)) + return EINVAL; + + /* for MINLOC/MAXLOC - return ID */ + if (op == PP_CALC_MINLOC || op == PP_CALC_MAXLOC) { + if (id) + *id = ntohll(*(uint64_t *)((unsigned char *)network_buffer + 8)); + else + return EINVAL; + } + + return 0; +} + + +static int pp_connect_ctx(struct pingpong_context *ctx, + 
struct ibv_qp *qp, + int port, + int my_psn, + enum ibv_mtu mtu, + int sl, + struct pingpong_dest *dest) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = mtu, + .dest_qp_num = dest->qpn, + .rq_psn = dest->psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = dest->lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (ibv_modify_qp(qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = my_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp(qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +static struct pingpong_dest *pp_client_exch_dest(const char *servername, + int port, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000"]; + int n; + int sockfd = -1; + struct pingpong_dest *rem_dest = NULL; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + if (n != 0) { /* POSIX: any non-zero return is a failure; EAI_* codes are positive on FreeBSD */ + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername,
port); + return NULL; + } + + sprintf(msg, "%04x:%06x:%06x", my_dest->lid, my_dest->qpn, my_dest->psn); + if (write(sockfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + + if (read(sockfd, msg, sizeof msg) != sizeof msg) { + perror("client read"); + fprintf(stderr, "Couldn't read remote address\n"); + goto out; + } + + if(write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + goto out; + } + + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn); + +out: + close(sockfd); + return rem_dest; +} + +static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx, + int ib_port, + enum ibv_mtu mtu, + int port, + int sl, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000"]; + int n; + int sockfd = -1, connfd; + struct pingpong_dest *rem_dest = NULL; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n != 0) { /* POSIX: any non-zero return is a failure; EAI_* codes are positive on FreeBSD */ + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, 0); + close(sockfd); + + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + n =
read(connfd, msg, sizeof msg); + if (n != sizeof msg) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address\n", + n, (int) sizeof msg); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn); + + if (pp_connect_ctx(ctx, ctx->qp, ib_port, my_dest->psn, mtu, + sl, rem_dest)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + sprintf(msg, "%04x:%06x:%06x", my_dest->lid, my_dest->qpn, + my_dest->psn); + if (write(connfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + /* expecting "done" msg */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + +out: + close(connfd); + return rem_dest; +} + +int pp_parse_calc_to_gather(char *ops_str, + enum pp_wr_calc_op calc_op, + enum pp_wr_data_type data_type, + struct pingpong_calc_ctx *calc_ctx, + struct ibv_context *ibv_ctx, + void *buff, + void *net_buff) +{ + struct calc_pack_input params; + int i, num_operands; + char *__gather_token, *__err_ptr = NULL; + + if (!ops_str) { + fprintf(stderr, "You must choose an operation to perform.\n"); + return -1; + } + + calc_ctx->init_opcode = calc_op; + calc_ctx->init_data_type = data_type; + calc_ctx->opcode = IBV_EXP_CALC_OP_NUMBER; + calc_ctx->data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; + calc_ctx->data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; + + for (i = 0, num_operands = 1; i < strlen(ops_str); i++) { + if (ops_str[i] == ',') + num_operands++; + } + + calc_ctx->gather_list_size = num_operands; + + __gather_token = strtok(ops_str, ","); + if (!__gather_token) + return -1; + + /* Build the gather list, assume one operand per sge. 
todo: improve for any nr of operands */ + for (i = 0; i < num_operands; i++) { + /* copy the operands to the buffer */ + switch (data_type) { + case PP_DATA_TYPE_INT8: + return -1; + + case PP_DATA_TYPE_INT16: + return -1; + + case PP_DATA_TYPE_INT32: + case PP_DATA_TYPE_UINT32: + *((int32_t *)buff + i*4) = strtol(__gather_token, &__err_ptr, 0); + break; + + case PP_DATA_TYPE_INT64: + case PP_DATA_TYPE_UINT64: + *((int64_t *)buff + i*2) = strtoll(__gather_token, &__err_ptr, 0); + break; + + case PP_DATA_TYPE_FLOAT32: + *((float *)buff + i*4) = strtof(__gather_token, &__err_ptr); + break; + + case PP_DATA_TYPE_FLOAT64: + *((FLOAT64 *)buff + i*2) = strtof(__gather_token, &__err_ptr); + break; + + default: + return -1; + } + + memset(¶ms, 0, sizeof(params)); + params.op = calc_ctx->init_opcode; + params.type = calc_ctx->init_data_type; + params.host_buf = (int64_t *) buff + i * 2; + params.id = 0; + params.out_op = &calc_ctx->opcode; + params.out_type = &calc_ctx->data_type; + params.out_size = &calc_ctx->data_size; + params.net_buf = (uint64_t *) net_buff + i * 2; + + if (pp_pack_data_for_calc(ibv_ctx, ¶ms)) { + fprintf(stderr, "Error in pack\n"); + return -1; + } + __gather_token = strtok(NULL, ","); + if (!__gather_token) + break; + + } + + calc_ctx->gather_buff = net_buff; + + return num_operands; +} + +static int pp_prepare_sg_list(int op_per_gather, + int num_operands, + uint32_t lkey, + struct pingpong_calc_ctx *calc_ctx, + void *buff) +{ + int num_sge, sz; + int i, gather_ix; + struct ibv_sge *gather_list = NULL; + + /* Data size is based on datatype returned from pack + * Note: INT16, INT32, INT64 -> INT64 (sz=8) + */ + sz = -1; + sz = pp_calc_data_size_to_bytes(calc_ctx->data_size); + num_sge = (num_operands / op_per_gather) + ((num_operands % op_per_gather) ? 1 : 0); /* todo - change to ceil. 
requires -lm */ + + gather_list = calloc(num_sge, sizeof(*gather_list)); + if (!gather_list) { + fprintf(stderr, "Failed to allocate %Zu bytes for gather_list\n", + (num_sge * sizeof(*gather_list))); + return -1; + } + + /* Build the gather list */ + for (i = 0, gather_ix = 0; i < num_operands; i++) { + if (!(i % op_per_gather)) { + gather_list[gather_ix].addr = (uint64_t)(uintptr_t)buff + ((sz + 8) * i); + gather_list[gather_ix].length = (sz + 8) * op_per_gather; + gather_list[gather_ix].lkey = lkey; + + gather_ix++; + } + } + + calc_ctx->gather_list = gather_list; + + return 0; +} + +struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, + int rx_depth, int port, int use_event, + enum pp_wr_calc_op calc_op, + enum pp_wr_data_type calc_data_type, + char *calc_operands_str) +{ + struct pingpong_context *ctx; + int rc; + + ctx = malloc(sizeof *ctx); + if (!ctx) + return NULL; + memset(ctx, 0, sizeof *ctx); + + ctx->size = size; + ctx->rx_depth = rx_depth; + + ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; + ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; + ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; + + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } + + memset(ctx->buf, 0, size); + + ctx->net_buf = memalign(page_size, size); + if (!ctx->net_buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_buffer; + } + memset(ctx->net_buf, 0, size); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_net_buf; + } + + if (use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else + ctx->channel = NULL; + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + 
goto clean_comp_channel; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + if (calc_op != PP_CALC_INVALID) { + int op_per_gather, num_op, max_num_op; + + ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; + ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; + ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; + + num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, + &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf); + if (num_op < 0) { + fprintf(stderr, "-E- failed parsing calc operators\n"); + goto clean_mr; + } + + rc = pp_query_calc_cap(ctx->context, + ctx->calc_op.opcode, + ctx->calc_op.data_type, + ctx->calc_op.data_size, + &op_per_gather, &max_num_op); + if (rc) { + fprintf(stderr, "-E- operation not supported on %s. valid ops are:\n", + ibv_get_device_name(ib_dev)); + + pp_print_dev_calc_ops(ctx->context); + goto clean_mr; + } + + if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) { + fprintf(stderr, "-failed to prepare the sg list\n"); + goto clean_mr; + } + } + + ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, + ctx->channel, 0); + if (!ctx->cq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_gather_list; + } + + { + struct ibv_exp_qp_init_attr attr = { + .send_cq = ctx->cq, + .recv_cq = ctx->cq, + .cap = { + .max_send_wr = 16, + .max_recv_wr = rx_depth, + .max_send_sge = 16, + .max_recv_sge = 16 + }, + .qp_type = IBV_QPT_RC, + .pd = ctx->pd + }; + + attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; + attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; + + ctx->qp = ibv_exp_create_qp(ctx->context, &attr); + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_cq; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 
+ }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_qp; + } + + } + + ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, + ctx->channel, 0); + if (!ctx->mcq) { + fprintf(stderr, "Couldn't create CQ for MQP\n"); + goto clean_qp; + } + + { + struct ibv_exp_qp_init_attr mattr = { + .send_cq = ctx->mcq, + .recv_cq = ctx->mcq, + .cap = { + .max_send_wr = 1, + .max_recv_wr = rx_depth, + .max_send_sge = 16, + .max_recv_sge = 16 + }, + .qp_type = IBV_QPT_RC, + .pd = ctx->pd + }; + + mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; + mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; + + ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr); + if (!ctx->mqp) { /* test the QP just created; ctx->qp is already known to be valid here */ + fprintf(stderr, "Couldn't create MQP\n"); + goto clean_mcq; + } + } + + { + struct ibv_qp_attr mattr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->mqp, &mattr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify MQP to INIT\n"); + goto clean_mqp; + } + } + + return ctx; + +clean_mqp: + ibv_destroy_qp(ctx->mqp); + +clean_mcq: + ibv_destroy_cq(ctx->mcq); + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_gather_list: + free(ctx->calc_op.gather_list); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_net_buf: + free(ctx->net_buf); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +int pp_close_ctx(struct pingpong_context *ctx) +{ + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy QP\n"); + return 1; + } + + + if (ibv_destroy_qp(ctx->mqp)) { +
fprintf(stderr, "Couldn't destroy MQP\n"); + return 1; + } + + + if (ibv_destroy_cq(ctx->cq)) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->mcq)) { + fprintf(stderr, "Couldn't destroy MCQ\n"); + return 1; + } + + free(ctx->calc_op.gather_list); + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + free(ctx->buf); + free(ctx->net_buf); + free(ctx); + + return 0; +} + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + int rc; + + struct ibv_sge list = { + .addr = (uintptr_t) ctx->net_buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PP_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) { + rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); + if (rc) + return rc; + } + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx) +{ + int ret; + + struct ibv_sge list = { + .addr = (uintptr_t) ctx->net_buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_exp_send_wr wr = { + .wr_id = PP_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .exp_opcode = IBV_EXP_WR_SEND, + .exp_send_flags = IBV_EXP_SEND_SIGNALED, + }; + struct ibv_exp_send_wr *bad_wr; + /* If this is a calc operation - set the required params in the wr */ + if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { + wr.exp_opcode = IBV_EXP_WR_SEND; + wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; + wr.sg_list = ctx->calc_op.gather_list; + wr.num_sge = ctx->calc_op.gather_list_size; + + wr.op.calc.calc_op = ctx->calc_op.opcode; + 
wr.op.calc.data_type = ctx->calc_op.data_type; + wr.op.calc.data_size = ctx->calc_op.data_size; + + } + + ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); + + return ret; +} + +int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op) +{ + int ret; + struct ibv_exp_send_wr wr = { + .wr_id = PP_CQE_WAIT, + .sg_list = NULL, + .num_sge = 0, + .exp_opcode = op, + .exp_send_flags = IBV_EXP_SEND_SIGNALED, + }; + struct ibv_exp_send_wr *bad_wr; + + switch (op) { + case IBV_EXP_WR_RECV_ENABLE: + case IBV_EXP_WR_SEND_ENABLE: + + wr.task.wqe_enable.qp = ctx->qp; + wr.task.wqe_enable.wqe_count = 0; + + wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; + + break; + + case IBV_EXP_WR_CQE_WAIT: + wr.task.cqe_wait.cq = ctx->cq; + wr.task.cqe_wait.cq_count = 1; + + wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; + + break; + + default: + fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op); + return -1; + } + + ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr); + + return ret; +} + +int pp_poll_mcq(struct ibv_cq *cq, int num_cqe) +{ + int ne; int i; + struct ibv_wc wc[2]; + + if (num_cqe > 2) { + fprintf(stderr, "-E- max num cqe exceeded\n"); + return -1; + } + + do { + ne = ibv_poll_cq(cq, num_cqe, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed %s status %s (%d)\n", + wr_id_str[(int)wc[i].wr_id], + ibv_wc_status_str(wc[i].status), + wc[i].status); + return 1; + } + + if ((int) wc[i].wr_id != PP_CQE_WAIT) { + fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id); + return -1; + } + } + + return 0; +} + +static int pp_calc_verify(struct pingpong_context *ctx, + enum pp_wr_data_type calc_data_type, + enum pp_wr_calc_op calc_opcode) +{ + uint64_t *op1 = &(ctx->last_result); + uint64_t *op2 = (uint64_t *)ctx->buf + 2; + uint64_t *res = (uint64_t *)ctx->buf; + + return !EXEC_VERIFY(calc_data_type, calc_opcode, 
1, op1, op2, res); +} + +static int pp_update_last_result(struct pingpong_context *ctx, + enum pp_wr_data_type calc_data_type, + enum pp_wr_calc_op calc_opcode) +{ + /* EXEC_VERIFY derefence result parameter */ + uint64_t *dummy; + + uint64_t *op1 = (uint64_t *)ctx->buf; + uint64_t *op2 = (uint64_t *)ctx->buf + 2; + uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy); + + ctx->last_result = res; + return 0; +} + + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s connect to server at \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port= listen on/connect to port (default 18515)\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); + printf(" -s, --size= size of message to exchange (default 4096 minimum 16)\n"); + printf(" -m, --mtu= path MTU (default 1024)\n"); + printf(" -r, --rx-depth= number of receives to post at a time (default 500)\n"); + printf(" -n, --iters= number of exchanges (default 1000)\n"); + printf(" -l, --sl= service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -c, --calc= calc operation\n"); + printf(" -t, --op_type= calc operands type\n"); + printf(" -o, --operands= comma separated list of operands\n"); + printf(" -w, --wait_cq=cqn wait for entries on cq\n"); + printf(" -v, --verbose print verbose information\n"); + printf(" -V, --verify verify calc operations\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev = NULL; + struct pingpong_context *ctx; + struct pingpong_dest my_dest; + struct pingpong_dest *rem_dest = NULL; + struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + int port = 18515; + int ib_port = 1; + int size = 4096; + + enum ibv_mtu mtu = IBV_MTU_1024; + int rx_depth = 
500; + int iters = 1000; + int routs; + int num_cq_events = 0; + int sl = 0; + int rcnt, scnt; + int use_event = 0; + int mqe_poll = 0; + int verbose = 0; + int verify = 0; + + struct calc_unpack_input params; + + enum pp_wr_data_type calc_data_type = PP_DATA_TYPE_INVALID; + enum pp_wr_calc_op calc_opcode = PP_CALC_INVALID; + char *calc_operands_str = NULL; + struct ibv_wc wc[2]; + int ne, i, ret = 0; + + srand48(getpid() * time(NULL)); + + page_size = sysconf(_SC_PAGESIZE); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "calc", .has_arg = 1, .val = 'c' }, + { .name = "op_type", .has_arg = 1, .val = 't' }, + { .name = "operands", .has_arg = 1, .val = 'o' }, + { .name = "poll_mqe", .has_arg = 0, .val = 'w' }, + { .name = "verbose", .has_arg = 0, .val = 'v' }, + { .name = "verify", .has_arg = 0, .val = 'V' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:et:c:o:wfvV", long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtol(optarg, NULL, 0); + if (size < 16) { + usage(argv[0]); + return 1; + } + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu < 0) { + usage(argv[0]); + return 1; + } + break; + + case 'r': + rx_depth = strtol(optarg, 
NULL, 0); + break; + + case 'n': + iters = strtol(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + break; + + case 'v': + verbose = 1; + break; + + case 'V': + verify = 1; + break; + + case 'e': + ++use_event; + break; + + case 't': + calc_data_type = pp_str_to_data_type(optarg); + if (calc_data_type == PP_DATA_TYPE_INVALID) { + printf("-E- invalid data types. Valid values are:\n"); + pp_print_data_type(); + return 1; + } + break; + + case 'o': + calc_operands_str = strdup(optarg); + break; + + case 'c': + calc_opcode = pp_str_to_calc_op(optarg); + if (calc_opcode == PP_CALC_INVALID) { + printf("-E- invalid data types. Valid values are:\n"); + pp_print_calc_op(); + return 1; + } + break; + + case 'w': + mqe_poll = 1; + break; + + default: + usage(argv[0]); + return 1; + } + } + + memset(¶ms, 0, sizeof(params)); + + /* calc and data type are mandatory */ + if (calc_opcode == PP_CALC_INVALID || calc_data_type == PP_DATA_TYPE_INVALID) { + fprintf(stderr, "Data type and calc operation must be specified\n"); + return 1; + } + + /* Verify that all the parameters required for calc operation were set */ + if (!calc_operands_str) { + fprintf(stderr, "Operands must be set for calc operation\n"); + return 1; + } + + if (optind == argc - 1) + servername = strdupa(argv[optind]); + else if (optind < argc) { + usage(argv[0]); + return 1; + } + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + + if (ib_devname) { + int i; + + for (i = 0; dev_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) { + ib_dev = dev_list[i]; + break; + } + } + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } else + ib_dev = *dev_list; + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event, + calc_opcode, calc_data_type, calc_operands_str); + if (!ctx) + return 1; + + if (servername) + pp_update_last_result(ctx, calc_data_type, 
calc_opcode); + else + ctx->last_result = *(uint64_t *)ctx->buf; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + ret = 1; + goto out; + } + + if (use_event) + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + ret = 1; + goto out; + } + + my_dest.lid = pp_get_local_lid(ctx->context, ib_port); + my_dest.qpn = ctx->qp->qp_num; + my_dest.psn = lrand48() & 0xffffff; + if (!my_dest.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + ret = 1; + goto out; + } + + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x : MQPN 0x%06x\n", + my_dest.lid, my_dest.qpn, my_dest.psn, ctx->mqp->qp_num); + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, &my_dest); + + if (!rem_dest) { + ret = 1; + goto out; + } + + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn); + + if (servername) + if (pp_connect_ctx(ctx, ctx->qp, ib_port, my_dest.psn, mtu, sl, rem_dest)) { + ret = 1; + goto out; + } + + if (mqe_poll) { + struct pingpong_dest loop_dest; + + loop_dest.lid = my_dest.lid; + loop_dest.psn = my_dest.psn; + loop_dest.qpn = ctx->mqp->qp_num; + + if (pp_connect_ctx(ctx, ctx->mqp, ib_port, my_dest.psn, mtu, sl, &loop_dest)) { + fprintf(stderr, "failed moving mqp to RTS\n"); + ret = 1; + goto out; + } + } + + ctx->pending = PP_RECV_WRID; + + if (servername) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + ret = 1; + goto out; + } + ctx->pending |= PP_SEND_WRID; + + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + ret = 1; + goto out; + + } + + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to 
get cq_event\n"); + ret = 1; + goto out; + } + + ++num_cq_events; + + if (ev_cq != ctx->cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + ret = 1; + goto out; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + ret = 1; + goto out; + } + } + + if (mqe_poll) { + int ne; + + if (pp_post_ext_wqe(ctx, IBV_EXP_WR_CQE_WAIT)) { + fprintf(stderr, "Failed posting cqe_wait wqe\n"); + ret = -1; + goto out; + } + + ne = pp_poll_mcq(ctx->mcq, 1); + if (ne < 0) { + fprintf(stderr, "poll MCQ failed %d\n", ne); + ret = -1; + goto out; + } + } + + do { + ne = ibv_poll_cq(ctx->cq, 2, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + ret = 1; + goto out; + } + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed %s status %s (%d v:%d) for count %d\n", + wr_id_str[(int) wc[i].wr_id], + ibv_wc_status_str(wc[i].status), wc[i].status, wc[i].vendor_err, + (int)(wc[i].wr_id == PP_SEND_WRID ? 
scnt : routs)); + ret = 1; + goto out; + } + + switch ((int)wc[i].wr_id) { + case PP_SEND_WRID: + ++scnt; + break; + + case PP_RECV_WRID: + params.op = calc_opcode; + params.type = calc_data_type; + params.net_buf = ctx->net_buf; + params.id = NULL; + params.host_buf = ctx->buf; + + if (pp_unpack_data_from_calc(ctx->context, ¶ms)) + fprintf(stderr, "Error in unpack \n"); + + if (verbose) { + + switch (calc_data_type) { + case PP_DATA_TYPE_INT32: + case PP_DATA_TYPE_INT64: + case PP_DATA_TYPE_UINT32: + case PP_DATA_TYPE_UINT64: + printf("incoming data is %" PRIu64 "\n", *(uint64_t *)ctx->buf); + break; + + case PP_DATA_TYPE_FLOAT32: + printf("incoming data is %f\n", *(float *)ctx->buf); + break; + + case PP_DATA_TYPE_FLOAT64: + printf("incoming data is %f\n", *(FLOAT64 *)ctx->buf); + break; + + default: + printf("incoming data is 0%016" PRIu64 "\n", + *(uint64_t *)ctx->buf); + } + } + if (verify) { + if (pp_calc_verify(ctx, calc_data_type, calc_opcode)) { + fprintf(stderr, "Calc verification failed\n"); + ret = 1; + goto out; + } + } + pp_update_last_result(ctx, calc_data_type, calc_opcode); + + if (--routs <= 1) { + routs += pp_post_recv(ctx, ctx->rx_depth - routs); + + if (routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + routs); + ret = 1; + goto out; + } + } + + ++rcnt; + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + ret = 1; + goto out; + } + + ctx->pending &= ~(int)wc[i].wr_id; + if (scnt < iters && !ctx->pending) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + ret = 1; + goto out; + } + ctx->pending = PP_RECV_WRID | PP_SEND_WRID; + } + } /* for (i = 0; i < ne; ++i) */ + } /* while (rcnt < iters || scnt < iters) */ + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + ret = 1; + goto out; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + 
printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + } + + ibv_ack_cq_events(ctx->cq, num_cq_events); +out: + ret = pp_close_ctx(ctx); + + ibv_free_device_list(dev_list); + if (calc_operands_str) + free(calc_operands_str); + + free(rem_dest); + + return ret; +} Index: contrib/ofed/libibverbs/examples/dc.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/dc.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2013 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __DC_H +#define __DC_H + +#include +#include + +struct pingpong_dest { + int lid; + int rsn; + uint64_t dckey; +}; + +/* DCTN LID DCT KEY */ +#define MSG_FORMAT "000000:0000:0000000000000000" + +static inline int to_ib_mtu(int mtu, enum ibv_mtu *ibmtu) +{ + switch (mtu) { + case 256: + *ibmtu = IBV_MTU_256; + return 0; + case 512: + *ibmtu = IBV_MTU_512; + return 0; + case 1024: + *ibmtu = IBV_MTU_1024; + return 0; + case 2048: + *ibmtu = IBV_MTU_2048; + return 0; + case 4096: + *ibmtu = IBV_MTU_4096; + return 0; + default: + return -1; + } +} + +#endif /* __DC_H */ + Index: contrib/ofed/libibverbs/examples/dcini.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/dcini.c @@ -0,0 +1,563 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dc.h" + +struct dc_ctx { + struct ibv_qp *qp; + struct ibv_cq *cq; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_ah *ah; + struct ibv_context *ctx; + void *addr; + size_t length; + int port; + int lid; + uint64_t remote_dct_key; + uint64_t dct_key; + int local_key_defined; + uint32_t dct_number; + struct ibv_port_attr portinfo; + int ib_port; + enum ibv_mtu mtu; + int sl; + uint16_t gid_index; + int use_gid; + union ibv_gid dgid; +}; + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s connect to server at \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port= listen on/connect to port (default 18515)\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); + printf(" -s, --size= size of message to exchange (default 4096)\n"); + printf(" -n, --iters= number of exchanges (default 1000)\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -c, --contiguous-mr use contiguous mr\n"); + printf(" -k, --dc-key DC transport key\n"); + printf(" -m, --mtu MTU of the DCI\n"); + printf(" -a, --check-nop check NOP opcode\n"); + printf(" -g, --gid-index gid index\n"); + printf(" -r, --dgid remote gid. 
must be given if -g is used\n"); + printf(" -l, --sl service level\n"); +} + +int send_nop(struct dc_ctx *ctx) +{ + struct ibv_exp_send_wr *bad_wr; + struct ibv_exp_send_wr wr; + struct ibv_exp_wc wc; + int err; + int n; + + memset(&wr, 0, sizeof(wr)); + + wr.num_sge = 0; + wr.exp_opcode = IBV_EXP_WR_NOP; + wr.exp_send_flags = IBV_EXP_SEND_SIGNALED; + + err = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); + if (err) { + fprintf(stderr, "post nop failed\n"); + return err; + } + + do { + n = ibv_exp_poll_cq(ctx->cq, 1, &wc, sizeof(wc)); + if (n < 0) { + fprintf(stderr, "poll CQ failed %d\n", n); + return -1; + } + } while (!n); + + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "completion with error %d\n", wc.status); + return -1; + } + + return 0; +} + +static int to_rts(struct dc_ctx *ctx) +{ + struct ibv_exp_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = ctx->ib_port, + .qp_access_flags = 0, + .dct_key = ctx->dct_key, + }; + + if (ibv_exp_modify_qp(ctx->qp, &attr, + IBV_EXP_QP_STATE | + IBV_EXP_QP_PKEY_INDEX | + IBV_EXP_QP_PORT | + IBV_EXP_QP_DC_KEY)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTR; + attr.max_dest_rd_atomic = 0; + attr.path_mtu = ctx->mtu; + attr.ah_attr.is_global = !!ctx->use_gid; + if (ctx->use_gid) { + attr.ah_attr.grh.sgid_index = ctx->gid_index; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = ctx->dgid; + } + + attr.ah_attr.dlid = ctx->lid; + attr.ah_attr.port_num = ctx->ib_port; + attr.ah_attr.sl = ctx->sl; + attr.dct_key = ctx->dct_key; + + if (ibv_exp_modify_qp(ctx->qp, &attr, IBV_EXP_QP_STATE | + IBV_EXP_QP_PATH_MTU | + IBV_EXP_QP_AV)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.max_rd_atomic = 1; + if (ibv_exp_modify_qp(ctx->qp, &attr, IBV_EXP_QP_STATE | + IBV_EXP_QP_TIMEOUT | + IBV_EXP_QP_RETRY_CNT | + 
IBV_EXP_QP_RNR_RETRY | + IBV_EXP_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +int pp_client_exch_dest(struct dc_ctx *ctx, const char *servername, int port) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof(MSG_FORMAT)]; + int n; + int sockfd = -1; + int err; + + if (asprintf(&service, "%d", port) < 0) + return -1; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return -1; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return -1; + } + + sprintf(msg, "%06x:%04x:0000000000000000", ctx->qp->qp_num, ctx->portinfo.lid); + if (write(sockfd, msg, sizeof(msg)) != sizeof(msg)) { + fprintf(stderr, "Couldn't send local address\n"); + err = -1; + } + + err = read(sockfd, msg, sizeof(msg)); + if (err != sizeof(msg)) { + perror("client read"); + fprintf(stderr, "Read %d/%zu\n", err, sizeof(msg)); + err = -1; + goto out; + } + + sscanf(msg, "%06x:%04x:%016" SCNx64, &ctx->dct_number, &ctx->lid, &ctx->remote_dct_key); + printf("Remote address: DCTN %06x, LID %04x, DCT key %016" PRIx64 "\n", + ctx->dct_number, ctx->lid, ctx->remote_dct_key); + + if (!ctx->local_key_defined) + ctx->dct_key = ctx->remote_dct_key; + + return 0; + +out: + close(sockfd); + return err; +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + char *ib_devname = NULL; + int port = 18515; + int size = 4096; + int iters = 1000; + int use_event = 0; + int use_contig_mr; + 
int err; + struct ibv_ah_attr ah_attr; + struct dc_ctx ctx = { + .ib_port = 1, + .mtu = IBV_MTU_2048, + .sl = 0, + }; + struct ibv_exp_send_wr wr; + struct ibv_exp_send_wr *bad_wr; + struct ibv_sge sg_list; + int i; + char *servername = NULL; + enum ibv_mtu mtu; + int check_nop = 0; + int dgid_given = 0; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "contig-mr", .has_arg = 0, .val = 'c' }, + { .name = "dc-key", .has_arg = 1, .val = 'k' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "check-nop", .has_arg = 0, .val = 'a' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "gid-index", .has_arg = 1, .val = 'g' }, + { .name = "dgid", .has_arg = 1, .val = 'r' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:n:ect:k:m:al:g:r:", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'k': + ctx.dct_key = strtoull(optarg, NULL, 0); + ctx.local_key_defined = 1; + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ctx.ib_port = strtol(optarg, NULL, 0); + if (ctx.ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtol(optarg, NULL, 0); + break; + + case 'n': + iters = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'c': + ++use_contig_mr; + break; + + case 'm': + mtu = strtol(optarg, NULL, 0); + if (to_ib_mtu(mtu, &ctx.mtu)) { + printf("invalid MTU %d\n", mtu); + return 1; + } + break; + + case 'a': + check_nop = 1; + break; + + case 'l': + ctx.sl = 
strtol(optarg, NULL, 0); + break; + + case 'g': + ctx.gid_index = strtol(optarg, NULL, 0); + ctx.use_gid = 1; + break; + + case 'r': + if (!inet_pton(AF_INET6, optarg, &ctx.dgid)) { + usage(argv[0]); + return 1; + } + dgid_given = 1; + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) { + servername = strdupa(argv[optind]); + if (ctx.use_gid && !dgid_given) { + usage(argv[0]); + return 1; + } + } else if (optind < argc) { + usage(argv[0]); + return 1; + } + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx.ctx = ibv_open_device(ib_dev); + if (!ctx.ctx) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + return 1; + } + + ctx.pd = ibv_alloc_pd(ctx.ctx); + if (!ctx.pd) { + fprintf(stderr, "failed to allocate pd\n"); + return 1; + } + + ctx.length = size; + ctx.addr = malloc(ctx.length); + if (!ctx.addr) { + fprintf(stderr, "failed to allocate memory\n"); + return -1; + } + + if (ibv_query_port(ctx.ctx, ctx.ib_port, &ctx.portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + ctx.mr = ibv_reg_mr(ctx.pd, ctx.addr, ctx.length, + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (!ctx.mr) { + fprintf(stderr, "failed to create mr\n"); + return -1; + } + + ctx.cq = ibv_create_cq(ctx.ctx, 128, NULL, NULL, 0); + if (!ctx.cq) { + fprintf(stderr, "failed to create cq\n"); + return -1; + } + + { + struct ibv_qp_init_attr_ex attr = { + .send_cq = ctx.cq, + .recv_cq = ctx.cq, + .cap = { + .max_send_wr = 100, + .max_send_sge = 1, + }, + .qp_type = 
IBV_EXP_QPT_DC_INI, + .pd = ctx.pd, + .comp_mask = IBV_QP_INIT_ATTR_PD, + }; + + ctx.qp = ibv_create_qp_ex(ctx.ctx, &attr); + if (!ctx.qp) { + fprintf(stderr, "failed to create qp\n"); + return -1; + } + } + + if (pp_client_exch_dest(&ctx, servername, port)) { + printf("failed to connect to target\n"); + return -1; + } + + printf("local address: LID %04x, QPN %06x, DC_KEY %016" PRIx64 "\n", + ctx.portinfo.lid, ctx.qp->qp_num, ctx.dct_key); + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.is_global = 0; + ah_attr.dlid = ctx.lid; + ah_attr.sl = ctx.sl; + ah_attr.src_path_bits = 0; + ah_attr.port_num = ctx.ib_port; + if (ctx.use_gid) { + ah_attr.is_global = 1; + ah_attr.grh.hop_limit = 1; + ah_attr.grh.sgid_index = ctx.gid_index; + ah_attr.grh.dgid = ctx.dgid; + } + ctx.ah = ibv_create_ah(ctx.pd, &ah_attr); + if (!ctx.ah) { + fprintf(stderr, "failed to create ah\n"); + return -1; + } + + err = to_rts(&ctx); + if (err) { + fprintf(stderr, "failed to move to rts\n"); + return -1; + } + + + if (check_nop) { + err = send_nop(&ctx); + if (err) { + fprintf(stderr, "nop operation failed\n"); + return err; + } + } + + for (i = 0; i < iters; ++i) { + memset(&wr, 0, sizeof(wr)); + wr.num_sge = 1; + wr.exp_opcode = IBV_EXP_WR_SEND; + wr.exp_send_flags = IBV_EXP_SEND_SIGNALED; + sg_list.addr = (uint64_t)(unsigned long)ctx.addr; + sg_list.length = ctx.length; + sg_list.lkey = ctx.mr->lkey; + wr.sg_list = &sg_list; + wr.dc.ah = ctx.ah; + wr.dc.dct_access_key = ctx.dct_key; + wr.dc.dct_number = ctx.dct_number; + + err = ibv_exp_post_send(ctx.qp, &wr, &bad_wr); + if (err) { + fprintf(stderr, "failed to post send request\n"); + return -1; + } else { + int num; + struct ibv_wc wc; + + do { + num = ibv_poll_cq(ctx.cq, 1, &wc); + if (num < 0) { + fprintf(stderr, "failed to poll cq\n"); + return -1; + } + } while (!num); + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "completion with error %d\n", wc.status); + return -1; + } + } + } + printf("test finished successfully\n"); + + 
return 0; +} Index: contrib/ofed/libibverbs/examples/dctgt.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/dctgt.c @@ -0,0 +1,606 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#if HAVE_CONFIG_H
+# include
+#endif /* HAVE_CONFIG_H */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "dc.h"
+
+/*
+ * State of the DC target example: the verbs objects created in main(),
+ * the receive buffer they describe, and the two helper threads.
+ */
+struct dc_ctx {
+	struct ibv_qp *qp;
+	struct ibv_cq *cq;
+	struct ibv_pd *pd;
+	struct ibv_mr *mr;
+	struct ibv_srq *srq;
+	struct ibv_context *ctx;
+	void *addr;
+	size_t length;
+	int port;			/* TCP port of the side channel */
+	uint64_t dct_key;
+	unsigned size;			/* bytes per receive buffer slot */
+	int ib_port;
+	enum ibv_mtu mtu;
+	int rcv_idx;			/* running count of posted receives */
+	struct ibv_port_attr portinfo;
+	struct ibv_exp_dct *dct;
+	pthread_t thread;		/* runs handle_clients() */
+	int thread_active;		/* handle_clients() loops while set */
+	int inl;
+	pthread_t poll_thread;		/* runs poll_async() */
+};
+
+/* Print the command-line synopsis; argv0 is the program name to show. */
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port= listen on/connect to port (default 18515)\n");
+	printf(" -d, --ib-dev= use IB device (default first device found)\n");
+	printf(" -i, --ib-port= use port of IB device (default 1)\n");
+	printf(" -s, --size= size of message to exchange (default 4096)\n");
+	printf(" -n, --iters= number of exchanges (unlimited)\n");
+	printf(" -e, --events sleep on CQ events (default poll)\n");
+	/* No -c line: the option table in main() does not accept it. */
+	printf(" -k, --dc-key DC transport key\n");
+	printf(" -m, --mtu MTU of the DCT\n");
+	printf(" -l, --inline Requested inline receive size\n");
+}
+
+/*
+ * Post up to n receive work requests on ctx->srq.
+ *
+ * Each request points at one ctx->size-byte slot of ctx->addr; slots are
+ * used round-robin over 32 positions, selected by ctx->rcv_idx.
+ *
+ * Returns the number of requests posted, which is less than n if
+ * ibv_post_srq_recv() fails.
+ */
+static int post_recv(struct dc_ctx *ctx, int n)
+{
+	struct ibv_sge list = {
+		.length = ctx->size,
+		.lkey = ctx->mr->lkey
+	};
+	struct ibv_recv_wr wr = {
+		.sg_list = &list,
+		.num_sge = 1,
+	};
+	struct ibv_recv_wr *bad_wr;
+	int i;
+
+	for (i = 0; i < n; ++i) {
+		list.addr = (uintptr_t)ctx->addr + (ctx->size * (ctx->rcv_idx++ % 32));
+		if (ibv_post_srq_recv(ctx->srq, &wr, &bad_wr))
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Serve one address exchange on the TCP side channel.
+ *
+ * Listens on ctx->port, accepts a single connection, reads the
+ * initiator's MSG_FORMAT message and answers with the local DCT number,
+ * LID and DCT key.  my_dest is not used.
+ *
+ * Returns a malloc()ed record describing the initiator, which the caller
+ * must free(), or NULL on any failure.  Both sockets are closed before
+ * returning.
+ */
+static struct pingpong_dest *pp_server_exch_dest(struct dc_ctx *ctx,
+						 const struct pingpong_dest *my_dest)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags = AI_PASSIVE,
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[sizeof(MSG_FORMAT)];
+	unsigned int rsn, lid;
+	int n;
+	int sockfd = -1, connfd;
+	struct pingpong_dest *rem_dest = NULL;
+
+	(void)my_dest;
+
+	if (asprintf(&service, "%d", ctx->port) < 0)
+		return NULL;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+
+	/* getaddrinfo() signals failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), ctx->port);
+		free(service);
+		return NULL;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n));
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", ctx->port);
+		return NULL;
+	}
+
+	if (listen(sockfd, 1)) {
+		/* Do not leak the bound socket when listen() fails. */
+		close(sockfd);
+		return NULL;
+	}
+
+	connfd = accept(sockfd, NULL, NULL);
+	close(sockfd);
+	if (connfd < 0) {
+		fprintf(stderr, "accept() failed\n");
+		return NULL;
+	}
+
+	n = read(connfd, msg, sizeof(msg));
+	if (n != (int)sizeof(msg)) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int)sizeof(msg));
+		goto out;
+	}
+
+	/* The peer's bytes are untrusted: terminate before parsing. */
+	msg[sizeof(msg) - 1] = '\0';
+
+	rem_dest = malloc(sizeof(*rem_dest));
+	if (!rem_dest)
+		goto out;
+
+	/* %x stores through unsigned int *, so parse into locals first. */
+	if (sscanf(msg, "%06x:%04x:%016" SCNx64, &rsn, &lid,
+		   &rem_dest->dckey) != 3) {
+		fprintf(stderr, "Couldn't parse remote address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+	rem_dest->rsn = rsn;
+	rem_dest->lid = lid;
+
+	printf("Connection from: QPN %06x, LID %04x\n", rem_dest->rsn, rem_dest->lid);
+
+	snprintf(msg, sizeof(msg), "%06x:%04x:%016" PRIx64,
+		 ctx->dct->dct_num, ctx->portinfo.lid, ctx->dct_key);
+
+	if (write(connfd, msg, sizeof(msg)) != (ssize_t)sizeof(msg)) {
+		fprintf(stderr, "Couldn't send local address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+out:
+	close(connfd);
+	return rem_dest;
+}
+
+/*
+ * Thread body: serve address exchanges one after another for as long as
+ * ctx->thread_active is set.  A failed exchange terminates the process.
+ */
+static void *handle_clients(void *arg)
+{
+	struct dc_ctx *ctx = arg;
+	struct pingpong_dest my_dest = { 0 };
+	struct pingpong_dest *ret;
+
+	while (ctx->thread_active) {
+		ret = pp_server_exch_dest(ctx, &my_dest);
+		if (!ret)
+			exit(EXIT_FAILURE);
+
+		/* Nothing here uses the peer record after the exchange. */
+		free(ret);
+	}
+
+	return NULL;
+}
+
+/*
+ * Return a printable name for the asynchronous events this example
+ * reports by name; every other event type yields "unexpected".
+ */
+static const char *event_name_str(enum ibv_event_type event_type)
+{
+	switch (event_type) {
+	case IBV_EVENT_DEVICE_FATAL:
+		return "IBV_EVENT_DEVICE_FATAL";
+	case IBV_EVENT_PORT_ACTIVE:
+		return "IBV_EVENT_PORT_ACTIVE";
+	case IBV_EVENT_PORT_ERR:
+		return "IBV_EVENT_PORT_ERR";
+	case IBV_EVENT_LID_CHANGE:
+		return "IBV_EVENT_LID_CHANGE";
+	case IBV_EVENT_PKEY_CHANGE:
+		return "IBV_EVENT_PKEY_CHANGE";
+	case IBV_EVENT_SM_CHANGE:
+		return "IBV_EVENT_SM_CHANGE";
+	case IBV_EVENT_CLIENT_REREGISTER:
+		return "IBV_EVENT_CLIENT_REREGISTER";
+	case IBV_EVENT_GID_CHANGE:
+		return "IBV_EVENT_GID_CHANGE";
+	case IBV_EXP_EVENT_DCT_KEY_VIOLATION:
+		return "IBV_EXP_EVENT_DCT_KEY_VIOLATION";
+	case IBV_EVENT_QP_ACCESS_ERR:
+		return "IBV_EVENT_QP_ACCESS_ERR";
+
+	case IBV_EVENT_CQ_ERR:
+	case IBV_EVENT_QP_FATAL:
+	case IBV_EVENT_QP_REQ_ERR:
+	case IBV_EVENT_COMM_EST:
+	case IBV_EVENT_SQ_DRAINED:
+	case IBV_EVENT_PATH_MIG:
+	case IBV_EVENT_PATH_MIG_ERR:
+	case IBV_EVENT_SRQ_ERR:
+	case IBV_EVENT_SRQ_LIMIT_REACHED:
+	case IBV_EVENT_QP_LAST_WQE_REACHED:
+	default:
+		return "unexpected";
+	}
+}
+
+/*
+ * Thread body: arm the DCT, wait for the next asynchronous event on the
+ * device context, print it and acknowledge it, forever.  The thread ends
+ * when arming the DCT or fetching an event fails.
+ */
+static void *poll_async(void *arg)
+{
+	struct dc_ctx *ctx = arg;
+	struct ibv_async_event event;
+	struct ibv_exp_arm_attr attr;
+	int err;
+
+	while (1) {
+		attr.comp_mask = 0;
+		err = ibv_exp_arm_dct(ctx->dct, &attr);
+		if (err) {
+			fprintf(stderr, "arm dct failed %d\n", err);
+			return NULL;
+		}
+		if (ibv_get_async_event(ctx->ctx, &event))
+			return NULL;
+
+		printf(" event_type %s (%d)\n",
+		       event_name_str(event.event_type),
+		       event.event_type);
+
+		ibv_ack_async_event(&event);
+	}
+
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	char *ib_devname = NULL;
+	int iters = 0;
+	int use_event = 0;
+	int err;
+	struct dc_ctx ctx = {
+		.port = 18515,
+		.ib_port = 1,
+		.dct_key = 0x1234,
+
.size = 4096, + .mtu = IBV_MTU_2048, + .inl = 0, + }; + int i; + uint32_t srqn; + int mtu; + struct ibv_exp_device_attr dattr; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "dc-key", .has_arg = 1, .val = 'k' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "inline", .has_arg = 1, .val = 'l' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:n:ek:m:l:", long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + ctx.port = strtol(optarg, NULL, 0); + if (ctx.port < 0 || ctx.port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'l': + ctx.inl = strtol(optarg, NULL, 0); + if (ctx.inl < 0) { + usage(argv[0]); + return 1; + } + break; + + case 'i': + ctx.ib_port = strtol(optarg, NULL, 0); + if (ctx.ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + case 'm': + mtu = strtol(optarg, NULL, 0); + if (to_ib_mtu(mtu, &ctx.mtu)) { + printf("invalid MTU %d\n", mtu); + return 1; + } + break; + + case 's': + ctx.size = strtol(optarg, NULL, 0); + break; + + case 'n': + iters = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'k': + ctx.dct_key = strtoull(optarg, NULL, 0); + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind < argc) { + usage(argv[0]); + return 1; + } + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) 
+ if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx.ctx = ibv_open_device(ib_dev); + if (!ctx.ctx) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + return 1; + } + + dattr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | + IBV_EXP_DEVICE_DC_RD_REQ | + IBV_EXP_DEVICE_DC_RD_RES; + err = ibv_exp_query_device(ctx.ctx, &dattr); + if (err) { + printf("couldn't query device extended attributes\n"); + return -1; + } else { + if (!(dattr.comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS)) { + printf("no extended capability flgas\n"); + return -1; + } + if (!(dattr.exp_device_cap_flags & IBV_EXP_DEVICE_DC_TRANSPORT)) { + printf("DC transport not enabled\n"); + return -1; + } + + if (!(dattr.comp_mask & IBV_EXP_DEVICE_DC_RD_REQ)) { + printf("no report on max requestor rdma/atomic resources\n"); + return -1; + } + + if (!(dattr.comp_mask & IBV_EXP_DEVICE_DC_RD_RES)) { + printf("no report on max responder rdma/atomic resources\n"); + return -1; + } + } + + ctx.pd = ibv_alloc_pd(ctx.ctx); + if (!ctx.pd) { + fprintf(stderr, "failed to allocate pd\n"); + return 1; + } + + ctx.length = 32 * ctx.size; + ctx.addr = malloc(ctx.length); + if (!ctx.addr) { + fprintf(stderr, "failed to allocate memory\n"); + return -1; + } + + ctx.mr = ibv_reg_mr(ctx.pd, ctx.addr, ctx.length, + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (!ctx.mr) { + fprintf(stderr, "failed to create mr\n"); + return -1; + } + + ctx.cq = ibv_create_cq(ctx.ctx, 128, NULL, NULL, 0); + if (!ctx.cq) { + fprintf(stderr, "failed to create cq\n"); + return -1; + } + + + { + struct ibv_srq_init_attr attr = { + .attr = { + .max_wr = 100, + .max_sge = 1 + } + }; + + ctx.srq = ibv_create_srq(ctx.pd, &attr); + if (!ctx.srq) { + fprintf(stderr, "Couldn't create SRQ\n"); + return -1; + } + ibv_get_srq_num(ctx.srq, &srqn); + } + + err = 
post_recv(&ctx, 100); + if (err != 100) { + fprintf(stderr, "posted %d out of %d receive buffers\n", err, 100); + return -1; + } + + { + struct ibv_exp_dct_init_attr dctattr = { + .pd = ctx.pd, + .cq = ctx.cq, + .srq = ctx.srq, + .dc_key = ctx.dct_key, + .port = ctx.ib_port, + .access_flags = IBV_ACCESS_REMOTE_WRITE, + .min_rnr_timer = 2, + .tclass = 0, + .flow_label = 0, + .mtu = ctx.mtu, + .pkey_index = 0, + .gid_index = 0, + .hop_limit = 1, + .create_flags = 0, + .inline_size = ctx.inl, + }; + + ctx.dct = ibv_exp_create_dct(ctx.ctx, &dctattr); + if (!ctx.dct) { + printf("create dct failed\n"); + return -1; + } + + { + struct ibv_exp_dct_attr dcqattr; + + dcqattr.comp_mask = 0; + err = ibv_exp_query_dct(ctx.dct, &dcqattr); + if (err) { + printf("query dct failed\n"); + return -1; + } else if (dcqattr.dc_key != ctx.dct_key) { + printf("queried dckry (0x%llx) is different then provided at create (0x%llx)\n", + (unsigned long long)dcqattr.dc_key, + (unsigned long long)ctx.dct_key); + return -1; + } else if (dcqattr.state != IBV_EXP_DCT_STATE_ACTIVE) { + printf("state is not active %d\n", dcqattr.state); + return -1; + } + } + + printf("local address: DCTN 0x%06x, SRQN 0x%06x, DCKEY 0x%016llx\n", + ctx.dct->dct_num, srqn, (unsigned long long)ctx.dct_key); + } + + if (ibv_query_port(ctx.ctx, ctx.ib_port, &ctx.portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + ctx.thread_active = 1; + err = pthread_create(&ctx.thread, NULL, handle_clients, &ctx); + if (err) { + perror("thread create faild:"); + return -1; + } + + err = pthread_create(&ctx.poll_thread, NULL, poll_async, &ctx); + if (err) { + perror("thread create faild:"); + return -1; + } + + for (i = 0; i < iters || iters == 0; ++i) { + int num; + struct ibv_wc wc; + + do { + num = ibv_poll_cq(ctx.cq, 1, &wc); + if (num < 0) { + fprintf(stderr, "failed to poll cq\n"); + return -1; + } + } while (!num); + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "completion with error:\n"); + 
fprintf(stderr, "status: %d\n", wc.status); + } else { + if (post_recv(&ctx, 1) != 1) { + fprintf(stderr, "failed to post receive buffer\n"); + return -1; + } + } + } + printf("test finished successfully\n"); + ctx.thread_active = 0; + if (pthread_cancel(ctx.thread)) + printf("pthread_cancel failed\n"); + + if (pthread_cancel(ctx.poll_thread)) + printf("pthread_cancel failed\n"); + + err = pthread_join(ctx.thread, NULL); + if (err) { + perror("thread join faild:"); + return -1; + } + if (ibv_exp_destroy_dct(ctx.dct)) + printf("destroy dct failed\n"); + + return 0; +} Index: contrib/ofed/libibverbs/examples/device_list.c =================================================================== --- contrib/ofed/libibverbs/examples/device_list.c +++ contrib/ofed/libibverbs/examples/device_list.c @@ -36,6 +36,9 @@ #include +#include +#include + #include #include Index: contrib/ofed/libibverbs/examples/devinfo.c =================================================================== --- contrib/ofed/libibverbs/examples/devinfo.c +++ contrib/ofed/libibverbs/examples/devinfo.c @@ -36,11 +36,15 @@ #endif /* HAVE_CONFIG_H */ #include +#include #include #include #include #include #include +#include +#include +#include #include #include @@ -68,9 +72,10 @@ static const char *transport_str(enum ibv_transport_type transport) { switch (transport) { - case IBV_TRANSPORT_IB: return "InfiniBand"; - case IBV_TRANSPORT_IWARP: return "iWARP"; - default: return "invalid transport"; + case IBV_TRANSPORT_IB: return "InfiniBand"; + case IBV_TRANSPORT_IWARP: return "iWARP"; + case IBV_EXP_TRANSPORT_SCIF: return "SCIF"; + default: return "invalid transport"; } } @@ -99,12 +104,13 @@ } } -static const char *atomic_cap_str(enum ibv_atomic_cap atom_cap) +static const char *exp_atomic_cap_str(enum ibv_exp_atomic_cap exp_atom_cap) { - switch (atom_cap) { - case IBV_ATOMIC_NONE: return "ATOMIC_NONE"; - case IBV_ATOMIC_HCA: return "ATOMIC_HCA"; - case IBV_ATOMIC_GLOB: return "ATOMIC_GLOB"; + switch 
(exp_atom_cap) {
+	case IBV_EXP_ATOMIC_NONE: return "ATOMIC_NONE";
+	case IBV_EXP_ATOMIC_HCA: return "ATOMIC_HCA";
+	case IBV_EXP_ATOMIC_GLOB: return "ATOMIC_GLOB";
+	case IBV_EXP_ATOMIC_HCA_REPLY_BE: return "ATOMIC_HCA_REPLY_BE";
 	default: return "invalid atomic capability";
 	}
 }
@@ -137,7 +143,13 @@
 	switch (speed) {
 	case 1: return "2.5 Gbps";
 	case 2: return "5.0 Gbps";
-	case 4: return "10.0 Gbps";
+
+	case 4: /* fall through */
+	case 8: return "10.0 Gbps";
+
+	case 16: return "14.0 Gbps";
+	case 32: return "25.0 Gbps";
+	case 64: return "50.0 Gbps";
 	default: return "invalid speed";
 	}
 }
@@ -187,18 +199,213 @@
 	switch (link_layer) {
 	case IBV_LINK_LAYER_UNSPECIFIED:
 	case IBV_LINK_LAYER_INFINIBAND:
-		return "IB";
+		return "InfiniBand";
 	case IBV_LINK_LAYER_ETHERNET:
 		return "Ethernet";
+	case IBV_EXP_LINK_LAYER_SCIF:
+		return "SCIF";
 	default:
 		return "Unknown";
 	}
 }
 
+static void print_caps(int caps)
+{
+	int unknown_flags = ~(IBV_DEVICE_RESIZE_MAX_WR |
+			      IBV_DEVICE_BAD_PKEY_CNTR |
+			      IBV_DEVICE_BAD_QKEY_CNTR |
+			      IBV_DEVICE_RAW_MULTI |
+			      IBV_DEVICE_AUTO_PATH_MIG |
+			      IBV_DEVICE_CHANGE_PHY_PORT |
+			      IBV_DEVICE_UD_AV_PORT_ENFORCE |
+			      IBV_DEVICE_CURR_QP_STATE_MOD |
+			      IBV_DEVICE_SHUTDOWN_PORT |
+			      IBV_DEVICE_INIT_TYPE |
+			      IBV_DEVICE_PORT_ACTIVE_EVENT |
+			      IBV_DEVICE_SYS_IMAGE_GUID |
+			      IBV_DEVICE_RC_RNR_NAK_GEN |
+			      IBV_DEVICE_SRQ_RESIZE |
+			      IBV_DEVICE_N_NOTIFY_CQ |
+			      IBV_DEVICE_XRC);
+	/* Name each known standard capability bit set in caps, then dump any remaining bits in hex. */
+	if (caps & IBV_DEVICE_RESIZE_MAX_WR)
+		printf("\t\t\t\t\tRESIZE_MAX_WR\n");
+	if (caps & IBV_DEVICE_BAD_PKEY_CNTR)
+		printf("\t\t\t\t\tBAD_PKEY_CNTR\n");
+	if (caps & IBV_DEVICE_BAD_QKEY_CNTR)
+		printf("\t\t\t\t\tBAD_QKEY_CNTR\n");
+	if (caps & IBV_DEVICE_RAW_MULTI)
+		printf("\t\t\t\t\tRAW_MULTI\n");
+	if (caps & IBV_DEVICE_AUTO_PATH_MIG)
+		printf("\t\t\t\t\tAUTO_PATH_MIG\n");
+	if (caps & IBV_DEVICE_CHANGE_PHY_PORT)
+		printf("\t\t\t\t\tCHANGE_PHY_PORT\n");
+	if (caps & IBV_DEVICE_UD_AV_PORT_ENFORCE)
+		printf("\t\t\t\t\tUD_AV_PORT_ENFORCE\n");
+	if (caps & IBV_DEVICE_CURR_QP_STATE_MOD)
+		printf("\t\t\t\t\tCURR_QP_STATE_MOD\n");
+	if (caps & IBV_DEVICE_SHUTDOWN_PORT)
+		printf("\t\t\t\t\tSHUTDOWN_PORT\n");
+	if (caps & IBV_DEVICE_INIT_TYPE)
+		printf("\t\t\t\t\tINIT_TYPE\n");
+	if (caps & IBV_DEVICE_PORT_ACTIVE_EVENT)
+		printf("\t\t\t\t\tPORT_ACTIVE_EVENT\n");
+	if (caps & IBV_DEVICE_SYS_IMAGE_GUID)
+		printf("\t\t\t\t\tSYS_IMAGE_GUID\n");
+	if (caps & IBV_DEVICE_RC_RNR_NAK_GEN)
+		printf("\t\t\t\t\tRC_RNR_NAK_GEN\n");
+	if (caps & IBV_DEVICE_SRQ_RESIZE)
+		printf("\t\t\t\t\tSRQ_RESIZE\n");
+	if (caps & IBV_DEVICE_N_NOTIFY_CQ)
+		printf("\t\t\t\t\tN_NOTIFY_CQ\n");
+	if (caps & IBV_DEVICE_XRC)
+		printf("\t\t\t\t\tXRC\n");
+	if (caps & unknown_flags)
+		printf("\t\t\t\t\tUnknown flags: 0x%08x\n", caps & unknown_flags);
+}
+/* Like print_caps(), but for the experimental IBV_EXP_DEVICE_* capability bits (64-bit word). */
+static void print_caps_exp(uint64_t caps)
+{
+	uint64_t unknown_flags = ~(IBV_EXP_DEVICE_DC_TRANSPORT |
+				   IBV_EXP_DEVICE_QPG |
+				   IBV_EXP_DEVICE_UD_RSS |
+				   IBV_EXP_DEVICE_UD_TSS |
+				   IBV_EXP_DEVICE_MEM_WINDOW |
+				   IBV_EXP_DEVICE_MEM_MGT_EXTENSIONS |
+				   IBV_EXP_DEVICE_MW_TYPE_2A |
+				   IBV_EXP_DEVICE_MW_TYPE_2B |
+				   IBV_EXP_DEVICE_CROSS_CHANNEL |
+				   IBV_EXP_DEVICE_MANAGED_FLOW_STEERING |
+				   IBV_EXP_DEVICE_MR_ALLOCATE |
+				   IBV_EXP_DEVICE_SHARED_MR |
+				   IBV_EXP_DEVICE_EXT_ATOMICS |
+				   IBV_EXP_DEVICE_NOP |
+				   IBV_EXP_DEVICE_UMR |
+				   IBV_EXP_DEVICE_ODP |
+				   IBV_EXP_DEVICE_VXLAN_SUPPORT |
+				   IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT |
+				   IBV_EXP_DEVICE_RX_CSUM_IP_PKT |
+				   IBV_EXP_DEVICE_DC_INFO);
+	/* The mask above must name every flag tested below, otherwise a known flag is also reported as unknown. */
+	if (caps & IBV_EXP_DEVICE_DC_TRANSPORT)
+		printf("\t\t\t\t\tEXP_DC_TRANSPORT\n");
+	if (caps & IBV_EXP_DEVICE_QPG)
+		printf("\t\t\t\t\tEXP_DEVICE_QPG\n");
+	if (caps & IBV_EXP_DEVICE_UD_RSS)
+		printf("\t\t\t\t\tEXP_UD_RSS\n");
+	if (caps & IBV_EXP_DEVICE_UD_TSS)
+		printf("\t\t\t\t\tEXP_UD_TSS\n");
+	if (caps & IBV_EXP_DEVICE_MEM_WINDOW)
+		printf("\t\t\t\t\tEXP_MEM_WINDOW\n");
+	if (caps & IBV_EXP_DEVICE_MEM_MGT_EXTENSIONS)
+		printf("\t\t\t\t\tEXP_MEM_MGT_EXTENSIONS\n");
+	if (caps & IBV_EXP_DEVICE_MW_TYPE_2A)
+		printf("\t\t\t\t\tEXP_MW_TYPE_2A\n");
+	if (caps & IBV_EXP_DEVICE_MW_TYPE_2B)
+		printf("\t\t\t\t\tEXP_MW_TYPE_2B\n");
+	if (caps & IBV_EXP_DEVICE_CROSS_CHANNEL)
+		printf("\t\t\t\t\tEXP_CROSS_CHANNEL\n");
+	if (caps & IBV_EXP_DEVICE_MANAGED_FLOW_STEERING)
+		printf("\t\t\t\t\tEXP_MANAGED_FLOW_STEERING\n");
+	if (caps & IBV_EXP_DEVICE_MR_ALLOCATE)
+		printf("\t\t\t\t\tEXP_MR_ALLOCATE\n");
+	if (caps & IBV_EXP_DEVICE_SHARED_MR)
+		printf("\t\t\t\t\tEXP_SHARED_MR\n");
+	if (caps & IBV_EXP_DEVICE_EXT_ATOMICS)
+		printf("\t\t\t\t\tEXT_ATOMICS\n");
+	if (caps & IBV_EXP_DEVICE_NOP)
+		printf("\t\t\t\t\tEXT_SEND NOP\n");
+	if (caps & IBV_EXP_DEVICE_UMR)
+		printf("\t\t\t\t\tEXP_UMR\n");
+	if (caps & IBV_EXP_DEVICE_ODP)
+		printf("\t\t\t\t\tEXP_ODP\n");
+	if (caps & IBV_EXP_DEVICE_VXLAN_SUPPORT)
+		printf("\t\t\t\t\tEXP_VXLAN_SUPPORT\n");
+	if (caps & IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT)
+		printf("\t\t\t\t\tEXP_RX_CSUM_TCP_UDP_PKT\n");
+	if (caps & IBV_EXP_DEVICE_RX_CSUM_IP_PKT)
+		printf("\t\t\t\t\tEXP_RX_CSUM_IP_PKT\n");
+	if (caps & IBV_EXP_DEVICE_DC_INFO)
+		printf("\t\t\t\t\tEXP_DC_INFO\n");
+	if (caps & unknown_flags)
+		printf("\t\t\t\t\tUnknown flags: 0x%" PRIX64 "\n", caps & unknown_flags);
+}
+/* Print the ODP operations set in one transport's capability word, or "NO SUPPORT" when the word is zero. */
+void print_odp_trans_caps(uint32_t trans)
+{
+	uint32_t unknown_transport_caps = ~(IBV_EXP_ODP_SUPPORT_SEND |
+					    IBV_EXP_ODP_SUPPORT_RECV |
+					    IBV_EXP_ODP_SUPPORT_WRITE |
+					    IBV_EXP_ODP_SUPPORT_READ |
+					    IBV_EXP_ODP_SUPPORT_ATOMIC |
+					    IBV_EXP_ODP_SUPPORT_SRQ_RECV);
+
+	if (!trans)
+		printf("\t\t\t\t\tNO SUPPORT\n");
+	else {
+		if (trans & IBV_EXP_ODP_SUPPORT_SEND)
+			printf("\t\t\t\t\tSUPPORT_SEND\n");
+		if (trans & IBV_EXP_ODP_SUPPORT_RECV)
+			printf("\t\t\t\t\tSUPPORT_RECV\n");
+		if (trans & IBV_EXP_ODP_SUPPORT_WRITE)
+			printf("\t\t\t\t\tSUPPORT_WRITE\n");
+		if (trans & IBV_EXP_ODP_SUPPORT_READ)
+			printf("\t\t\t\t\tSUPPORT_READ\n");
+		if (trans & IBV_EXP_ODP_SUPPORT_ATOMIC)
+			printf("\t\t\t\t\tSUPPORT_ATOMIC\n");
+		if (trans & IBV_EXP_ODP_SUPPORT_SRQ_RECV)
+			printf("\t\t\t\t\tSUPPORT_SRQ_RECV\n");
+		if (trans & unknown_transport_caps)
+			printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n",
+			       trans & unknown_transport_caps);
+	}
+}
+/* Print the general ODP capabilities, then the per-transport words (RC, UC, UD, DC, XRC, raw Ethernet). */
+void print_odp_caps(struct ibv_exp_odp_caps caps)
+{
+	uint64_t unknown_general_caps = ~(IBV_EXP_ODP_SUPPORT);
+
+	/* general odp caps */
+	printf("\tgeneral_odp_caps:\n");
+	if (caps.general_odp_caps & IBV_EXP_ODP_SUPPORT)
+		printf("\t\t\t\t\tODP_SUPPORT\n");
+	if (caps.general_odp_caps & unknown_general_caps)
+		printf("\t\t\t\t\tUnknown flags: 0x%" PRIX64 "\n",
+		       caps.general_odp_caps & unknown_general_caps);
+
+	/* per-transport odp caps */
+	printf("\trc_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.rc_odp_caps);
+	printf("\tuc_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.uc_odp_caps);
+	printf("\tud_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.ud_odp_caps);
+	printf("\tdc_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.dc_odp_caps);
+	printf("\txrc_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.xrc_odp_caps);
+	printf("\traw_eth_odp_caps:\n");
+	print_odp_trans_caps(caps.per_transport_caps.raw_eth_odp_caps);
+}
+/* Map a single IBV_EXP_QPT_* flag to a short name; any other value yields "UNKNOWN". */
+static char *qp_type_flag_str(enum ibv_exp_supported_qp_types qp_type_flag)
+{
+	switch (qp_type_flag) {
+	case IBV_EXP_QPT_RC: return "RC";
+	case IBV_EXP_QPT_UC: return "UC";
+	case IBV_EXP_QPT_UD: return "UD";
+	case IBV_EXP_QPT_XRC_INIT: return "XRC_INIT";
+	case IBV_EXP_QPT_XRC_TGT: return "XRC_TGT";
+	case IBV_EXP_QPT_RAW_PACKET: return "RAW_PACKET";
+	default: return "UNKNOWN";
+	}
+}
+
 static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port)
 {
 	struct ibv_context *ctx;
-	struct ibv_device_attr device_attr;
+	struct ibv_exp_device_attr device_attr;
+	struct ibv_device_attr device_legacy_attr;
 	struct ibv_port_attr port_attr;
 	int rc = 0;
 	uint8_t port;
@@ -210,10 +417,18 @@
 		rc = 1;
 		goto cleanup;
 	}
-	if (ibv_query_device(ctx, &device_attr)) {
-		fprintf(stderr, "Failed to query device props");
-		rc = 2;
-		goto cleanup;
+
+	memset(&device_attr, 0, sizeof(device_attr));
+
device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1; + + if (ibv_exp_query_device(ctx, &device_attr)) { + if (ibv_query_device(ctx, &device_legacy_attr)) { + fprintf(stderr, "Failed to query device props\n"); + rc = 2; + goto cleanup; + } + + memcpy(&device_attr, &device_legacy_attr, sizeof(device_legacy_attr)); } printf("hca_id:\t%s\n", ibv_get_device_name(ib_dev)); @@ -239,7 +454,12 @@ (unsigned long long) device_attr.page_size_cap); printf("\tmax_qp:\t\t\t\t%d\n", device_attr.max_qp); printf("\tmax_qp_wr:\t\t\t%d\n", device_attr.max_qp_wr); - printf("\tdevice_cap_flags:\t\t0x%08x\n", device_attr.device_cap_flags); + printf("\tdevice_cap_flags:\t\t0x%08x\n", + (int)(device_attr.exp_device_cap_flags & (IBV_EXP_START_FLAG - 1))); + print_caps(device_attr.exp_device_cap_flags & (IBV_EXP_START_FLAG - 1)); + printf("\tdevice_cap_exp_flags:\t\t0x%" PRIX64 "\n", + device_attr.exp_device_cap_flags & ~(IBV_EXP_START_FLAG - 1)); + print_caps_exp(device_attr.exp_device_cap_flags & ~(IBV_EXP_START_FLAG - 1)); printf("\tmax_sge:\t\t\t%d\n", device_attr.max_sge); printf("\tmax_sge_rd:\t\t\t%d\n", device_attr.max_sge_rd); printf("\tmax_cq:\t\t\t\t%d\n", device_attr.max_cq); @@ -252,7 +472,11 @@ printf("\tmax_qp_init_rd_atom:\t\t%d\n", device_attr.max_qp_init_rd_atom); printf("\tmax_ee_init_rd_atom:\t\t%d\n", device_attr.max_ee_init_rd_atom); printf("\tatomic_cap:\t\t\t%s (%d)\n", - atomic_cap_str(device_attr.atomic_cap), device_attr.atomic_cap); + exp_atomic_cap_str(device_attr.exp_atomic_cap), + device_attr.exp_atomic_cap); + printf("\tlog atomic arg sizes (mask)\t\t%" PRIx64 "\n", device_attr.ext_atom.log_atomic_arg_sizes); + printf("\tmax fetch and add bit boundary\t%d\n", device_attr.ext_atom.max_fa_bit_boundary); + printf("\tlog max atomic inline\t\t%d\n", device_attr.ext_atom.log_max_atomic_inline); printf("\tmax_ee:\t\t\t\t%d\n", device_attr.max_ee); printf("\tmax_rdd:\t\t\t%d\n", device_attr.max_rdd); printf("\tmax_mw:\t\t\t\t%d\n", device_attr.max_mw); @@ -273,8 
+497,47 @@ } printf("\tmax_pkeys:\t\t\t%d\n", device_attr.max_pkeys); printf("\tlocal_ca_ack_delay:\t\t%d\n", device_attr.local_ca_ack_delay); + printf("\thca_core_clock:\t\t\t%" PRIu64 "\n", + device_attr.hca_core_clock); + printf("\tmax_klm_list_size:\t\t%d\n", device_attr.umr_caps.max_klm_list_size); + printf("\tmax_send_wqe_inline_klms:\t%d\n", device_attr.umr_caps.max_send_wqe_inline_klms); + printf("\tmax_umr_recursion_depth:\t%d\n", device_attr.umr_caps.max_umr_recursion_depth); + printf("\tmax_umr_stride_dimension:\t%d\n", device_attr.umr_caps.max_umr_stride_dimension); + print_odp_caps(device_attr.odp_caps); + printf("\tmax_dct:\t\t\t%d\n", device_attr.max_dct); + printf("\tmax_device_ctx:\t\t\t%d\n", device_attr.max_device_ctx); + if ((device_attr.comp_mask & IBV_EXP_DEVICE_ATTR_MP_RQ) && + device_attr.mp_rq_caps.supported_qps) { + enum ibv_exp_supported_qp_types qp_type_flag = IBV_EXP_QPT_RC; + uint32_t unknown_shifts_flags = device_attr.mp_rq_caps.allowed_shifts & + ~IBV_EXP_MP_RQ_2BYTES_SHIFT; + + printf("\tMulti-Packet RQ supported\n"); + printf("\t\tSupported for QP types: "); + while (qp_type_flag < IBV_EXP_QPT_RESERVED) { + if (device_attr.mp_rq_caps.supported_qps & qp_type_flag) + printf("%s ", qp_type_flag_str(qp_type_flag)); + qp_type_flag <<= 1; + } + printf("\n"); + printf("\t\tSupported payload shifts:\n"); + if (device_attr.mp_rq_caps.allowed_shifts & IBV_EXP_MP_RQ_2BYTES_SHIFT) + printf("\t\t\t2 bytes\n"); + if (unknown_shifts_flags) + printf("\t\t\tUnknown payload shift flags (0x%x)\n", unknown_shifts_flags); + printf("\t\tLog number of strides for single WQE: %d - %d\n", + device_attr.mp_rq_caps.min_single_wqe_log_num_of_strides, + device_attr.mp_rq_caps.max_single_wqe_log_num_of_strides); + printf("\t\tLog number of bytes in single stride: %d - %d\n", + device_attr.mp_rq_caps.min_single_stride_log_num_of_bytes, + device_attr.mp_rq_caps.max_single_stride_log_num_of_bytes); + } else { + printf("\tMulti-Packet RQ is not supported\n"); + } } 
+ if (device_attr.phys_port_cnt) + printf("\tDevice ports:\n"); for (port = 1; port <= device_attr.phys_port_cnt; ++port) { /* if in the command line the user didn't ask for info about this port */ if ((ib_port) && (port != ib_port)) @@ -295,7 +558,8 @@ printf("\t\t\tsm_lid:\t\t\t%d\n", port_attr.sm_lid); printf("\t\t\tport_lid:\t\t%d\n", port_attr.lid); printf("\t\t\tport_lmc:\t\t0x%02x\n", port_attr.lmc); - printf("\t\t\tlink_layer:\t\t%s\n", link_layer_str(port_attr.link_layer)); + printf("\t\t\tlink_layer:\t\t%s\n", + link_layer_str(port_attr.link_layer)); if (verbose) { printf("\t\t\tmax_msg_sz:\t\t0x%x\n", port_attr.max_msg_sz); @@ -335,8 +599,8 @@ printf("Usage: %s print the ca attributes\n", argv0); printf("\n"); printf("Options:\n"); - printf(" -d, --ib-dev= use IB device (default first device found)\n"); - printf(" -i, --ib-port= use port of IB device (default all ports)\n"); + printf(" -d, --ib-dev= use IB device (default all devices found)\n"); + printf(" -i, --ib-port= use port of IB device (default 0: all ports)\n"); printf(" -l, --list print only the IB devices names\n"); printf(" -v, --verbose print all the attributes of the IB device(s)\n"); } Index: contrib/ofed/libibverbs/examples/get_clock.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/get_clock.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *
+ * Author: Michael S. Tsirkin
+ */
+
+#ifndef GET_CLOCK_H
+#define GET_CLOCK_H
+/* get_cycles() reads the CPU's free-running cycle/time-base counter; one variant per architecture. */
+#if defined(__x86_64__) || defined(__i386__)
+/* Note: only x86 CPUs which have rdtsc instruction are supported. */
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	unsigned low, high;
+	unsigned long long val;
+
+	asm volatile ("rdtsc" : "=a" (low), "=d" (high));
+	val = high;
+	val = (val << 32) | low;
+	return val;
+}
+#elif defined(__PPC__) || defined(__PPC64__)
+/* Note: only PPC CPUs which have mftb instruction are supported. */
+/* PPC64 has mftb */
+typedef unsigned long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	cycles_t ret;
+
+	asm volatile ("mftb %0" : "=r" (ret) : );
+	return ret;
+}
+#elif defined(__ia64__)
+/* Itanium2 and up has ar.itc (Itanium1 has errata) */
+typedef unsigned long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	cycles_t ret;
+
+	asm volatile ("mov %0=ar.itc" : "=r" (ret));
+	return ret;
+}
+#elif defined(__s390x__)
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	cycles_t clk;
+
+	asm volatile("stck %0" : "=Q" (clk) : : "cc");
+	return clk >> 2;
+}
+#elif defined(__sparc__) && defined(__arch64__)
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	cycles_t v;
+
+	asm volatile ("rd %%tick, %0" : "=r" (v) : );
+	return v;
+}
+#elif defined(__aarch64__)
+
+typedef unsigned long cycles_t;
+static inline cycles_t get_cycles(void)
+{
+	cycles_t cval;
+
+	asm volatile("isb" : : : "memory");
+	asm volatile("mrs %0, cntvct_el0" : "=r" (cval));
+	return cval;
+}
+
+#else
+#warning get_cycles not implemented for this architecture: attempt asm/timex.h
+#include <asm/timex.h>
+#endif
+/* Counter frequency in MHz, 0 on failure; pass non-zero to tolerate conflicting /proc/cpuinfo values. */
+extern double get_cpu_mhz(int);
+
+#endif
Index: contrib/ofed/libibverbs/examples/get_clock.c
===================================================================
--- /dev/null
+++ contrib/ofed/libibverbs/examples/get_clock.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * Author: Michael S. Tsirkin + */ + +/* #define DEBUG 1 */ +/* #define DEBUG_DATA 1 */ +/* #define GET_CPU_MHZ_FROM_PROC 1 */ + +/* For gettimeofday */ +#define _DEFAULT_SOURCE +#define _BSD_SOURCE +#include + +#include +#include +#include +#include "get_clock.h" + +#ifndef DEBUG +#define DEBUG 0 +#endif +#ifndef DEBUG_DATA +#define DEBUG_DATA 0 +#endif + +#define MEASUREMENTS 200 +#define USECSTEP 10 +#define USECSTART 100 + +/* + Use linear regression to calculate cycles per microsecond. 
+http://en.wikipedia.org/wiki/Linear_regression#Parameter_estimation
+*/
+static double sample_get_cpu_mhz(void)
+{
+	struct timeval tv1, tv2;
+	cycles_t start;
+	double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
+	double tx, ty;
+	int i;
+
+	/* Regression: y = a + b x */
+	long x[MEASUREMENTS];
+	cycles_t y[MEASUREMENTS];
+	double a; /* system call overhead in cycles */
+	double b; /* cycles per microsecond */
+	double r_2;
+
+	for (i = 0; i < MEASUREMENTS; ++i) {
+		start = get_cycles();
+
+		if (gettimeofday(&tv1, NULL)) {
+			fprintf(stderr, "gettimeofday failed.\n");
+			return 0;
+		}
+
+		do {
+			if (gettimeofday(&tv2, NULL)) {
+				fprintf(stderr, "gettimeofday failed.\n");
+				return 0;
+			}
+		} while ((tv2.tv_sec - tv1.tv_sec) * 1000000 +
+			 (tv2.tv_usec - tv1.tv_usec) < USECSTART + i * USECSTEP);
+
+		x[i] = (tv2.tv_sec - tv1.tv_sec) * 1000000 +
+			tv2.tv_usec - tv1.tv_usec;
+		y[i] = get_cycles() - start;
+		if (DEBUG_DATA)
+			fprintf(stderr, "x=%ld y=%lld\n", x[i], (long long)y[i]);
+	}
+
+	for (i = 0; i < MEASUREMENTS; ++i) {
+		tx = x[i];
+		ty = y[i];
+		sx += tx;
+		sy += ty;
+		sxx += tx * tx;
+		syy += ty * ty;
+		sxy += tx * ty;
+	}
+
+	b = (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx);
+	a = (sy - b * sx) / MEASUREMENTS;
+
+	if (DEBUG)
+		fprintf(stderr, "a = %g\n", a);
+	if (DEBUG)
+		fprintf(stderr, "b = %g\n", b);
+	if (DEBUG)
+		fprintf(stderr, "a / b = %g\n", a / b);
+	r_2 = (MEASUREMENTS * sxy - sx * sy) * (MEASUREMENTS * sxy - sx * sy) /
+		(MEASUREMENTS * sxx - sx * sx) /
+		(MEASUREMENTS * syy - sy * sy);
+
+	if (DEBUG)
+		fprintf(stderr, "r^2 = %g\n", r_2);
+	if (r_2 < 0.9) {
+		fprintf(stderr, "Correlation coefficient r^2: %g < 0.9\n", r_2);
+		return 0;
+	}
+
+	return b;
+}
+/* Nominal CPU MHz from /proc/cpuinfo; 0.0 if unreadable or, unless no_cpu_freq_fail is set, if entries differ by >2%. */
+#ifndef __s390x__
+static double proc_get_cpu_mhz(int no_cpu_freq_fail)
+{
+	FILE *f;
+	char buf[256];
+	double mhz = 0.0;
+	int print_flag = 0;
+	double delta;
+
+	f = fopen("/proc/cpuinfo", "r");
+	if (!f)
+		return 0.0;
+	while (fgets(buf, sizeof(buf), f)) {
+		double m;
+		int rc;
+
+	#if defined(__ia64__)
+		/* Use the ITC frequency on IA64 */
+		rc = sscanf(buf, "itc MHz : %lf", &m);
+	#elif defined(__PPC__) || defined(__PPC64__)
+		/* PPC has a different format as well */
+		rc = sscanf(buf, "clock : %lf", &m);
+	#elif defined(__sparc__) && defined(__arch64__)
+		/*
+		 * on sparc the /proc/cpuinfo lines that hold
+		 * the cpu freq in HZ are as follow:
+		 * Cpu{cpu-num}ClkTck : 00000000a9beeee4
+		 */
+		char *s;
+		unsigned val;
+
+		s = strstr(buf, "ClkTck\t: ");
+		if (!s)
+			continue;
+		s += (strlen("ClkTck\t: ") - strlen("0x"));
+		strncpy(s, "0x", strlen("0x")); /* overwrite ": " with "0x" in place; no terminator wanted */
+		rc = sscanf(s, "%x", &val);
+		m = val/1000000;
+	#else
+		rc = sscanf(buf, "cpu MHz : %lf", &m);
+	#endif
+
+		if (rc != 1)
+			continue;
+
+		if (mhz == 0.0) {
+			mhz = m;
+			continue;
+		}
+		delta = mhz > m ? mhz - m : m - mhz;
+		if ((delta / mhz > 0.02) && (print_flag == 0)) {
+			print_flag = 1;
+			fprintf(stderr, "Conflicting CPU frequency values"
+				" detected: %lf != %lf\n", mhz, m);
+			if (no_cpu_freq_fail) {
+				fprintf(stderr, "Test integrity may be harmed !\n");
+			} else {
+				fclose(f);
+				return 0.0;
+			}
+			continue;
+		}
+	}
+
+	fclose(f);
+	return mhz;
+}
+#endif
+/* Counter frequency in MHz: the /proc/cpuinfo value if within 2% of the measured one, else the measured one; 0 on failure. */
+double get_cpu_mhz(int no_cpu_freq_fail)
+{
+	#ifdef __s390x__
+	return sample_get_cpu_mhz();
+	#else
+	double sample, proc, delta;
+
+	sample = sample_get_cpu_mhz();
+	proc = proc_get_cpu_mhz(no_cpu_freq_fail);
+	#ifdef __aarch64__
+	if (proc < 1)
+		proc = sample;
+	#endif
+	if (!proc || !sample)
+		return 0;
+
+	delta = proc > sample ? proc - sample : sample - proc;
+	if (delta / proc > 0.02) {
+	#if !defined(__PPC__) && !defined(__PPC64__)
+		fprintf(stderr, "Warning: measured timestamp frequency "
+			"%g differs from nominal %g MHz\n",
+			sample, proc);
+		if (!no_cpu_freq_fail)
+			fprintf(stderr, " Add --CPU-freq flag to show report\n");
+	#endif
+		return sample;
+	}
+	return proc;
+	#endif
+}
Index: contrib/ofed/libibverbs/examples/intf.c
===================================================================
--- /dev/null
+++ contrib/ofed/libibverbs/examples/intf.c
@@ -0,0 +1,2109 @@
+/*
+ * Copyright (c) 2015 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *
+ * Author: Moshe Lazer
+ */
+
+#define _GNU_SOURCE
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "get_clock.h"
+
+#ifndef likely
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x), 1)
+#else
+#define likely(x) (x)
+#endif
+#endif
+
+
+#ifndef unlikely
+#ifdef __GNUC__
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define unlikely(x) (x)
+#endif
+#endif
+
+/* Interface choices as given in the input; each trailing IN_NUM_* enumerator equals the number of choices. */
+enum input_send_intf {
+	IN_NORMAL_SEND_INTF,
+	IN_ACC_SEND_PENDING_INTF,
+	IN_ACC_SEND_PENDING_INL_INTF,
+	IN_ACC_SEND_PENDING_SG_LIST_INTF,
+	IN_ACC_SEND_BURST_INTF,
+	IN_NUM_SEND_INTF
+};
+
+enum input_recv_intf {
+	IN_NORMAL_RECV_INTF,
+	IN_ACC_RECV_BURST_INTF,
+	IN_NUM_RECV_INTF,
+};
+
+enum input_poll_intf {
+	IN_NORMAL_POLL_INTF,
+	IN_ACC_POLL_CNT_INTF,
+	IN_ACC_POLL_LENGTH_INTF,
+	IN_NUM_POLL_INTF
+};
+
+enum qp_intf {
+	NORMAL_SEND_INTF,
+	NORMAL_RECV_INTF,
+	ACC_SEND_PENDING_INTF,
+	ACC_SEND_PENDING_INL_INTF,
+	ACC_SEND_PENDING_SG_LIST_INTF,
+	ACC_SEND_BURST_INTF,
+	ACC_RECV_BURST_INTF,
+};
+
+enum cq_intf {
+	NORMAL_POLL_INTF,
+	ACC_POLL_CNT_INTF,
+	ACC_POLL_LENGTH_INTF,
+	ACC_POLL_LENGTH_INL_INTF,
+};
+
+struct qp_params {
+	int wr_burst;
+	int max_send_wr;
+	int max_recv_wr;
+	int max_inl_recv_data;
+	enum input_send_intf verbs_send_intf;
+	enum input_recv_intf verbs_recv_intf;
+	enum input_poll_intf verbs_send_poll_intf;
+	enum input_poll_intf verbs_recv_poll_intf;
+};
+
+struct send_params {
+	int msg_size;
+	int num_qp_msgs;
+};
+
+struct cpu_set {
+	int min;
+	int max;
+};
+
+#define MAX_CPU_SETS 4
+struct thread_params {
+	int num_threads;
+	int num_cpu_sets;
+	struct cpu_set cpu_sets[MAX_CPU_SETS];
+};
+#define MAX_DEV_NAME_SIZE 20
+struct ib_data {
+	char dev_name[MAX_DEV_NAME_SIZE];
+	int ib_port_num;
+	int sl;
+	enum ibv_mtu mtu;
+	int check_data;
+	int use_res_domain;
+};
+
+#define MAX_SERVER_NAME_SIZE 128
+struct server_data {
+	char name[MAX_SERVER_NAME_SIZE];
+	int port;
+};
+
+struct intf_input {
+	struct server_data server_data;
+	struct ib_data ib_data;
+	struct qp_params qp_prms;
+	struct send_params send_prms;
+	struct thread_params thread_prms;
+};
+
+struct intf_input intf_default_input = {
+	.server_data = {
+		.name = "",
+		.port = 18515
+	},
+	.ib_data = {
+		.dev_name = "mlx4_0",
+		.ib_port_num = 1,
+		.sl = 0,
+		.mtu = IBV_MTU_4096,
+		.check_data = 0,
+		.use_res_domain = 1,
+	},
+	.qp_prms = {
+		.wr_burst = 10, /* burst-size: the number of messages to use in one send/receive transaction */
+		.max_send_wr = 3*5*4*5*7, /* Defines the size of the send queue in messages (must be a multiple of burst-size) */
+		.max_recv_wr = 3*5*4*5*7, /* Defines the size of the receive queue in messages (must be a multiple of burst-size) */
+		.max_inl_recv_data = 0, /* max in-line receive data to use for QP creation */
+		.verbs_send_intf = IN_ACC_SEND_PENDING_INTF, /* Defines which send interface to use */
+		.verbs_recv_intf = IN_ACC_RECV_BURST_INTF, /* Defines which receive interface to use */
+		.verbs_send_poll_intf = IN_ACC_POLL_CNT_INTF, /* Defines which poll interface to use for sent messages */
+		.verbs_recv_poll_intf = IN_ACC_POLL_LENGTH_INTF, /* Defines which poll interface to use for received messages */
+	},
+	.send_prms = {
+		.msg_size = 64, /* msg size */
+		.num_qp_msgs = 1000000, /* Number of messages to send via each QP */
+	},
+
+	.thread_prms = {
+		.num_threads = 1, /* number of threads to use */
+		.num_cpu_sets = 2, /* This field and the cpu_sets field define on which CPUs application threads may run */
+		.cpu_sets = { {0, 5}, {12, 17} }
+	}
+};
+struct intf_input intf_input;
+
+#define INVALID_DURATION ((unsigned long)(-1))
+struct qp_data {
+	int remote_qpn;
+	int psn;
+	int msg_size;
+	int msg_stride;
+	long num_msgs;
+	int wr_burst;
+	int max_wrs;
+	int max_inl_recv_data;
+	int max_inline_data;
+	enum qp_intf qp_intf;
+	unsigned long total_ms;
+	struct ibv_qp *qp;
+	struct ibv_sge *sg_list;
+	struct ibv_send_wr *send_wr;
+	struct ibv_recv_wr *recv_wr;
+	struct ibv_exp_qp_burst_family *qp_burst_family;
+	char *buf;
+	struct ibv_mr *mr;
+};
+
+struct cq_data {
+	int wc_burst;
+	int cq_size;
+	enum cq_intf cq_intf;
+	struct ibv_cq *cq;
+	struct ibv_wc *wc;
+	struct ibv_exp_cq_family *cq_family;
+};
+
+struct qp_cq_data {
+	int idx;
+	struct qp_data qp;
+	struct cq_data cq;
+};
+
+#define MAX_INLINE_RECV 512
+struct intf_context;
+struct intf_thread {
+	struct intf_context *ctx;
+	struct ibv_exp_res_domain *single_res_domain;
+	char inlr_buf[MAX_INLINE_RECV];
+	uint32_t use_inlr;
+	int qp_idx;
+	int thread_idx;
+	int cpu;
+	unsigned long cpu_freq;
+};
+
+#define MAX_MSG_SIZE 0x10000
+
+struct ib_dest {
+	int lid;
+	union ibv_gid gid;
+	int *qpn;
+	int *psn;
+};
+
+struct intf_context {
+	char *servername;
+	int is_send;
+	int port;
+	char dev_name[MAX_DEV_NAME_SIZE];
+	struct ibv_device *ib_dev;
+	struct ibv_context *context;
+	struct ibv_pd *pd;
+	int ib_port_num;
+	int sl;
+	enum ibv_mtu mtu;
+	int num_qps_cqs;
+	struct qp_cq_data *qps_cqs;
+	int num_threads;
+	struct intf_thread *threads;
+	sem_t threads_sem;
+	sem_t threads_done_sem;
+	int thread_stop;
+	struct ibv_exp_device_attr dattr;
+	struct ib_dest remote_dst;
+	struct ib_dest local_dst;
+	int check_data;
+	int use_res_domain;
+};
+
+sem_t clk_sem;
+/* Counter frequency in Hz (get_cpu_mhz() * 1e6), serialized by clk_sem; no_cpu_freq_fail is forwarded to get_cpu_mhz(). */
+static inline double clk_get_cpu_hz(int no_cpu_freq_fail)
+{
+	double cycles_in_sec;
+
+	sem_wait(&clk_sem);
+	cycles_in_sec = get_cpu_mhz(no_cpu_freq_fail) * 1000000;
+	sem_post(&clk_sem);
+
+	return cycles_in_sec;
+}
+/* Read get_cycles() while holding clk_sem. */
+static inline cycles_t clk_get_cycles(void)
+{
+	cycles_t cycles;
+
+	sem_wait(&clk_sem);
+	cycles = get_cycles();
+	sem_post(&clk_sem);
+
+	return cycles;
+}
+/* Initialise clk_sem as a process-private semaphore with value 1; must run before the other clk_* helpers. */
+static inline void clk_init(void)
+{
+	sem_init(&clk_sem, 0, 1);
+}
+
+#define mmax(a, b) ((a) > (b) ? (a) : (b))
+#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) + +static void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) +{ + int i; + uint32_t *raw = (uint32_t *)gid->raw; + + for (i = 0; i < 4; ++i) + sprintf(&wgid[i * 8], "%08x", + htonl(raw[i])); +} + +static void wire_gid_to_gid(const char *wgid, const union ibv_gid *gid) +{ + char tmp[9]; + uint32_t v32; + uint32_t *raw = (uint32_t *)gid->raw; + int i; + + for (tmp[8] = 0, i = 0; i < 4; ++i) { + memcpy(tmp, wgid + i * 8, 8); + if (sscanf(tmp, "%x", &v32) != 1) + v32 = 0; + raw[i] = ntohl(v32); + } +} + +/* + * get_rand - return a pseudo-random integer in the half-open range [a, b). + * An empty or inverted range (b <= a) yields a instead of dividing by zero. + */ +static int get_rand(int a, int b) +{ + if (b <= a) + return a; + + /* offset by a so the lower bound is honoured, not only the span */ + return a + (lrand48() & 0xffffffff) % (b - a); +} + +static inline char *enum_to_cq_intf_str(enum cq_intf cq_intf) +{ + switch (cq_intf) { + case NORMAL_POLL_INTF: return "NORMAL_POLL_INTF"; + case ACC_POLL_CNT_INTF: return "ACC_POLL_CNT_INTF"; + case ACC_POLL_LENGTH_INTF: return "ACC_POLL_LENGTH_INTF"; + case ACC_POLL_LENGTH_INL_INTF: return "ACC_POLL_LENGTH_INL_INTF"; + default: return "ERR_INTF"; + } +} + +/* + * qp_poll - poll for send/receive completions using the different verbs interfaces + */ +static inline int qp_poll(struct cq_data *cq_data, const enum cq_intf cq_intf, struct intf_thread *thread) __attribute__((always_inline)); +static inline int qp_poll(struct cq_data *cq_data, const enum cq_intf cq_intf, struct intf_thread *thread) +{ + int consumed; + int i; + + switch (cq_intf) { + case NORMAL_POLL_INTF: + consumed = ibv_poll_cq(cq_data->cq, cq_data->wc_burst, cq_data->wc); + if (likely(consumed > 0)) { + for (i = 0; i < consumed; i++) { + if (cq_data->wc[i].status) { + fprintf(stderr, "poll_cq erroneous status %d\n", cq_data->wc[i].status); + + return -1; + } + } + } + break; + + case ACC_POLL_CNT_INTF: + consumed = cq_data->cq_family->poll_cnt(cq_data->cq, cq_data->wc_burst); + break; + + case ACC_POLL_LENGTH_INTF: + consumed = cq_data->cq_family->poll_length(cq_data->cq, NULL, NULL); + if (consumed > 0) + consumed = 1; + break; + + case ACC_POLL_LENGTH_INL_INTF: + consumed =
cq_data->cq_family->poll_length(cq_data->cq, thread->inlr_buf, &thread->use_inlr); + if (consumed > 0) + consumed = 1; + break; + + default: + fprintf(stderr, "qp_poll - interface %d not supported\n", cq_intf); + consumed = -1; + } + + return consumed; +} + +/* + * qp_post - send/receive a burst of messages using the different verbs interfaces + */ +static inline int qp_post(struct qp_data *qp_data, struct ibv_sge *sg_list, int wr_idx, const enum qp_intf qp_intf) __attribute__((always_inline)); +static inline int qp_post(struct qp_data *qp_data, struct ibv_sge *sg_list, int wr_idx, const enum qp_intf qp_intf) +{ + int i; + int ret = 0; + struct ibv_recv_wr *bad_rwr; + struct ibv_send_wr *bad_swr; + + switch (qp_intf) { + case NORMAL_SEND_INTF: + ret = ibv_post_send(qp_data->qp, &qp_data->send_wr[wr_idx % qp_data->max_wrs], &bad_swr); + break; + + case NORMAL_RECV_INTF: + ret = ibv_post_recv(qp_data->qp, &qp_data->recv_wr[wr_idx % qp_data->max_wrs], &bad_rwr); + break; + + case ACC_SEND_PENDING_INTF: + for (i = 0; i < qp_data->wr_burst && !ret; i++) { + struct ibv_sge *sg_l = sg_list + i; + + ret = qp_data->qp_burst_family->send_pending(qp_data->qp, sg_l->addr, sg_l->length, sg_l->lkey, IBV_EXP_QP_BURST_SIGNALED); + } + if (!ret) + ret = qp_data->qp_burst_family->send_flush(qp_data->qp); + break; + + case ACC_SEND_PENDING_INL_INTF: + for (i = 0; i < qp_data->wr_burst && !ret; i++) { + struct ibv_sge *sg_l = sg_list + i; + + ret = qp_data->qp_burst_family->send_pending_inline(qp_data->qp, (void *)(uintptr_t)sg_l->addr, sg_l->length, IBV_EXP_QP_BURST_SIGNALED); + } + if (!ret) + ret = qp_data->qp_burst_family->send_flush(qp_data->qp); + break; + + case ACC_SEND_PENDING_SG_LIST_INTF: + for (i = 0; i < qp_data->wr_burst && !ret; i++) { + struct ibv_sge *sg_l = sg_list + i; + + ret = qp_data->qp_burst_family->send_pending_sg_list(qp_data->qp, sg_l, 1, IBV_EXP_QP_BURST_SIGNALED); + } + if (!ret) + ret = qp_data->qp_burst_family->send_flush(qp_data->qp); + break; + + case 
ACC_SEND_BURST_INTF: + ret = qp_data->qp_burst_family->send_burst(qp_data->qp, sg_list, qp_data->wr_burst, IBV_EXP_QP_BURST_SIGNALED); + break; + + case ACC_RECV_BURST_INTF: + ret = qp_data->qp_burst_family->recv_burst(qp_data->qp, sg_list, qp_data->wr_burst); + break; + } + + if (unlikely(ret)) { + fprintf(stderr, "ibv_post_send failed in interface = %d, err = %d\n", qp_intf, ret); + return -1; + } + + return qp_data->wr_burst; +} + +/* On each byte of the message put a nibble from the WR index + * and a nibble from the QP index + */ +static char calc_msg_data(int wr_idx, int qp_idx) +{ + char data = (char)((wr_idx & 0xF) | (qp_idx << 4)); + + return data; +} + +/* + * is_data_valid - Check received data + * To keep performance it checks only one random byte from the + * received (consumed) messages. + */ +static int is_data_valid(long *curr_poll_wr, int consumed, struct qp_cq_data *qp_cq_data, struct intf_thread *thread) +{ + char rand_data; + int rand_wr = (*curr_poll_wr + get_rand(0, consumed)) % qp_cq_data->qp.max_wrs; + int rand_idx = get_rand(0, qp_cq_data->qp.msg_size); + char send_data = calc_msg_data(rand_wr, qp_cq_data->idx); + + if (thread->use_inlr) + rand_data = thread->inlr_buf[rand_idx]; + else + rand_data = (qp_cq_data->qp.buf + rand_wr * qp_cq_data->qp.msg_stride)[rand_idx]; + if (rand_data != send_data) { + int wr, i; + + fprintf(stderr, "Received wrong data on thread = %d expected value = 0x%x actual value = 0x%x\n", + thread->thread_idx, send_data, rand_data); + fprintf(stderr, " use_inlr %d, curr_poll_wr %ld(0x%lx), consumed %d, rand_wr = %d, rand_idx = %d msg_size = %d\n", + thread->use_inlr, *curr_poll_wr, *curr_poll_wr, consumed, rand_wr, rand_idx, qp_cq_data->qp.msg_size); + for (wr = 0; wr < rand_wr + 2; wr++) { + int max_print = mmin(qp_cq_data->qp.msg_stride, 128); + + fprintf(stderr, "wr %d:", wr); + for (i = 0; i < max_print; i++) { + if (i == qp_cq_data->qp.msg_size) + fprintf(stderr, " |"); + fprintf(stderr, " %x", 
*(qp_cq_data->qp.buf + wr * qp_cq_data->qp.msg_stride + i)); + } + fprintf(stderr, "\n:"); + } + return 0; + } + thread->use_inlr = 0; + *curr_poll_wr += consumed; + + return 1; +} + +/* send_recv - is a function to send/receive messages using one QP/CQ set. + * + * While there are more messages to send/receive it uses the following logic: + * 1. Fill send/receive queue with burst of messages. + * 2. Poll completion queue until there is enough space in the QP to post + * additional burst (go back to 1). + */ +static inline int send_recv(struct qp_cq_data *qp_cq_data, + struct intf_thread *thread, const int check_data, + const enum qp_intf qp_intf, enum cq_intf cq_intf) +{ + int msg = 0; + int num_wrs; + int free_wrs; + int consumed; + long curr_poll_wr = 0; + struct ibv_sge *base_sg_list; + + num_wrs = qp_cq_data->qp.max_wrs; + base_sg_list = qp_cq_data->qp.sg_list; + + free_wrs = mmin(num_wrs, qp_cq_data->qp.num_msgs); + + while (msg < qp_cq_data->qp.num_msgs) { + /* Fill send/receive queue using bursts of messages*/ + while (free_wrs >= qp_cq_data->qp.wr_burst) { + if (qp_post(&qp_cq_data->qp, base_sg_list + (msg % num_wrs), msg, qp_intf) < 0) { + fprintf(stderr, "Post QP(%d) failed for thread %d\n", qp_intf, thread->thread_idx); + return 1; + } + msg += qp_cq_data->qp.wr_burst; + free_wrs -= qp_cq_data->qp.wr_burst; + } + + /* In order to put another burst of messages we need first to + * make sure there is enough space in the send/receive queue. 
+ * Poll on the completion queue until we get the required space + */ + do { + consumed = qp_poll(&qp_cq_data->cq, cq_intf, thread); + if (likely(consumed > 0)) { + free_wrs += consumed; + if (unlikely(check_data)) + if (!is_data_valid(&curr_poll_wr, consumed, qp_cq_data, thread)) + return 1; + } else if (consumed < 0) { + fprintf(stderr, "Poll CQ(%s) failed for thread %d\n", enum_to_cq_intf_str(cq_intf), thread->thread_idx); + return 1; + } + } while (free_wrs < qp_cq_data->qp.wr_burst && msg < qp_cq_data->qp.num_msgs); + } + + return 0; +} + +static enum qp_intf send_2_qp[IN_NUM_SEND_INTF] = { + [IN_NORMAL_SEND_INTF] = NORMAL_SEND_INTF, + [IN_ACC_SEND_PENDING_INTF] = ACC_SEND_PENDING_INTF, + [IN_ACC_SEND_PENDING_INL_INTF] = ACC_SEND_PENDING_INL_INTF, + [IN_ACC_SEND_PENDING_SG_LIST_INTF] = ACC_SEND_PENDING_SG_LIST_INTF, + [IN_ACC_SEND_BURST_INTF] = ACC_SEND_BURST_INTF, +}; + +static enum qp_intf recv_2_qp[IN_NUM_POLL_INTF] = { + [IN_NORMAL_RECV_INTF] = NORMAL_RECV_INTF, + [IN_ACC_RECV_BURST_INTF] = ACC_RECV_BURST_INTF, +}; + +static enum cq_intf poll_2_cq[IN_NUM_POLL_INTF] = { + [IN_NORMAL_POLL_INTF] = NORMAL_POLL_INTF, + [IN_ACC_POLL_CNT_INTF] = ACC_POLL_CNT_INTF, + [IN_ACC_POLL_LENGTH_INTF] = ACC_POLL_LENGTH_INTF, +}; + +static int run_thread_on_cpu(int cpu, int thread_idx) { + int j; + cpu_set_t cpuset; + pthread_t pthread; + + pthread = pthread_self(); + + /* Set the selected cpu for the thread */ + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + /* Force the thread to run on the selected cpu */ + if (pthread_setaffinity_np(pthread, sizeof(cpu_set_t), &cpuset)) + return 1; + + /* Make sure the thread is running on the selected cpu */ + if (pthread_getaffinity_np(pthread, sizeof(cpu_set_t), &cpuset)) { + fprintf(stderr, "Couldn't get thread(%d) affinity\n", + thread_idx); + } else { + for (j = 0; j < CPU_SETSIZE; j++) + if (CPU_ISSET(j, &cpuset) && (j != cpu)) + return 1; + } + + return 0; +} + + +/* __thread_wrap - is the main function of all application 
threads. + * The thread switch itself to the right cpu, waits for sync from main + * thread to start send/receive messages, executes send/receive messages + * and signals the main thread upon completion. + */ +static void *__thread_wrap(void *arg) +{ + cycles_t start; + struct intf_thread *thread = (struct intf_thread *)arg; + struct qp_cq_data *qp_cq = &thread->ctx->qps_cqs[thread->qp_idx]; + int check_data = thread->ctx->check_data; + + + /* Run thread on selected cpu */ + if (run_thread_on_cpu(thread->cpu, thread->thread_idx)) { + fprintf(stderr, "Couldn't run thread %d on cpu %d (errno = %d)\n", + thread->thread_idx, thread->cpu, errno); + goto thread_out; + } else { + printf("\tThread %d - Start on cpu %d\n", thread->thread_idx, thread->cpu); + } + + /* Get the cpu clk frequency */ + thread->cpu_freq = (unsigned long)clk_get_cpu_hz(0); + if (thread->cpu_freq == 0) + fprintf(stderr, "Can't get cpu(%d) frequency\n", thread->cpu); + + /* Wait for signal to start the send/receive process */ + sem_wait(&thread->ctx->threads_sem); + + qp_cq->qp.total_ms = INVALID_DURATION; + + start = clk_get_cycles(); + + /* Send/receive thread messages */ + if (send_recv(qp_cq, thread, check_data, qp_cq->qp.qp_intf, qp_cq->cq.cq_intf) || + !thread->cpu_freq) + /* Total exec time not valid if send_recv or cpu_freq failed */ + goto thread_out; + + /* calculate the total execution time in milli-seconds */ + qp_cq->qp.total_ms = ((clk_get_cycles() - start) * 1000) / + thread->cpu_freq; + +thread_out: + /* signal about thread completion */ + sem_post(&thread->ctx->threads_done_sem); + printf("\tThread %d - done\n", thread->thread_idx); + pthread_exit(NULL); +} + +static inline char *send_enum_to_verbs_intf_str(enum input_send_intf verbs_intf) +{ + switch (verbs_intf) { + case IN_NORMAL_SEND_INTF: return "S_NORM"; + case IN_ACC_SEND_PENDING_INTF: return "S_PEND"; + case IN_ACC_SEND_PENDING_INL_INTF: return "S_PEND_INL"; + case IN_ACC_SEND_PENDING_SG_LIST_INTF: return "S_PEND_SG_LIST"; + 
case IN_ACC_SEND_BURST_INTF: return "S_BURST"; + default: return "ERR_SEND_INTF"; + } +} + +static inline char *recv_enum_to_verbs_intf_str(enum input_recv_intf verbs_intf) +{ + switch (verbs_intf) { + case IN_NORMAL_RECV_INTF: return "R_NORM"; + case IN_ACC_RECV_BURST_INTF: return "R_BURST"; + default: return "ERR_RECV_INTF"; + } +} + +static inline char *poll_enum_to_verbs_intf_str(enum input_poll_intf verbs_intf) +{ + switch (verbs_intf) { + case IN_NORMAL_POLL_INTF: return "P_NORM"; + case IN_ACC_POLL_CNT_INTF: return "P_CNT"; + case IN_ACC_POLL_LENGTH_INTF: return "P_LEN"; + default: return "ERR_POLL_INTF"; + } +} + +static inline char *qp_intf_to_param_str(enum qp_intf verbs_intf) +{ + switch (verbs_intf) { + case NORMAL_SEND_INTF: return "S_NORM"; + case NORMAL_RECV_INTF: return "R_NORM"; + case ACC_SEND_PENDING_INTF: return "S_PEND"; + case ACC_SEND_PENDING_INL_INTF: return "S_PEND_INL"; + case ACC_SEND_PENDING_SG_LIST_INTF: return "S_PEND_SG_LIST"; + case ACC_SEND_BURST_INTF: return "S_BURST"; + case ACC_RECV_BURST_INTF: return "R_BURST"; + default: return "ERR_QP_INTF"; + } +} + +static inline char *cq_intf_to_param_str(enum cq_intf verbs_intf) +{ + switch (verbs_intf) { + case NORMAL_POLL_INTF: return "P_NORM"; + case ACC_POLL_CNT_INTF: return "P_CNT"; + case ACC_POLL_LENGTH_INTF: return "P_LEN"; + case ACC_POLL_LENGTH_INL_INTF: return "P_LEN"; + default: return "ERR_CQ_INTF"; + } +} + +static void print_qp_report(struct qp_cq_data *qp_cq_data, int send) +{ + + struct qp_data *qp_data = &qp_cq_data->qp; + char *post_s = qp_intf_to_param_str(qp_data->qp_intf); + char *poll_s = cq_intf_to_param_str(qp_cq_data->cq.cq_intf); + long mps; + + if (!qp_data->total_ms || qp_data->total_ms == INVALID_DURATION) { + if (qp_data->total_ms == INVALID_DURATION) + printf("\tTest execution aborted!\n"); + else + printf("\tTest execution time is too short to measure!\n"); + mps = 0; + } else { + mps = (qp_data->num_msgs * 1000) / qp_data->total_ms; + } + 
printf("\tmsg_size = %d, num_sge = 1, wr_burst = %d, intf = %s:%s, num_msgs = %'ld, time_ms = %'ld", + qp_data->msg_size, qp_data->wr_burst, post_s, poll_s, + qp_data->num_msgs, qp_data->total_ms); + if (mps) + printf(" mps = %'ld\n", mps); + else + printf("\n"); +} + +static void print_thread_report(struct intf_thread *thread) +{ + printf("Thread %d: CPU = %d MHz = %ld\n", + thread->thread_idx, thread->cpu, thread->cpu_freq/1000000); + + if (!thread->ctx->thread_stop) { + printf("\t%s QP %d data:\n", + thread->ctx->is_send ? "Send" : "Recv", + thread->qp_idx); + print_qp_report(&thread->ctx->qps_cqs[thread->qp_idx], + thread->ctx->is_send); + } +} + +static void print_global_report(struct intf_context *ctx) +{ + printf("Global test parameters: check_data = %d use_res_domain = %d\n", + ctx->check_data, ctx->use_res_domain); +} + +int run_threads(struct intf_context *ctx) +{ + int i, j; + int err; + pthread_t tid; + + sem_init(&ctx->threads_sem, 0, 0); + sem_init(&ctx->threads_done_sem, 0, 0); + clk_init(); + for (i = 0; i < ctx->num_threads; i++) { + ctx->threads[i].thread_idx = i; + ctx->threads[i].ctx = ctx; + err = pthread_create(&tid, NULL, __thread_wrap, &ctx->threads[i]); + if (err != 0) { + fprintf(stderr, "Can't create thread :[%s]", strerror(err)); + goto clean_threads; + } + } + + for (i = 0; i < ctx->num_threads; i++) + sem_post(&ctx->threads_sem); + + for (i = 0; i < ctx->num_threads; i++) + sem_wait(&ctx->threads_done_sem); + + print_global_report(ctx); + for (i = 0; i < ctx->num_threads; i++) + print_thread_report(&ctx->threads[i]); + + return 0; + +clean_threads: + + ctx->thread_stop = 1; + for (j = i ; j > 0; j--) + sem_post(&ctx->threads_sem); + + for (j = i ; j > 0; j--) + sem_wait(&ctx->threads_done_sem); + + return 1; +} + +static int connect_qp(struct ibv_qp *qp, int port, int my_psn, + enum ibv_mtu mtu, int sl, + union ibv_gid r_gid, int r_lid, int r_psn, int r_qpn, + int sgid_idx) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + 
.path_mtu = mtu, + .dest_qp_num = r_qpn, + .rq_psn = r_psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = r_lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (r_gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = r_gid; + attr.ah_attr.grh.sgid_index = sgid_idx; + } + if (ibv_modify_qp(qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = my_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp(qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + + +static int create_cq(struct intf_context *ctx, + struct qp_cq_data *qps_cqs, + struct ibv_exp_cq_init_attr *init_attr, + struct ibv_exp_query_intf_params *intf_params) +{ + enum ibv_exp_query_intf_status intf_status; + + qps_cqs->cq.wc = calloc(1, sizeof(struct ibv_wc) * qps_cqs->cq.wc_burst); + + if (!qps_cqs->cq.wc) + return 1; + + init_attr->res_domain = ctx->threads[qps_cqs->idx].single_res_domain; + + qps_cqs->cq.cq = ibv_exp_create_cq(ctx->context, qps_cqs->cq.cq_size, + NULL, NULL, 0, init_attr); + if (!qps_cqs->cq.cq) { + fprintf(stderr, "Couldn't create CQ (errno = %d)\n", errno); + goto free_wc; + } + + intf_params->intf = IBV_EXP_INTF_CQ; + intf_params->obj = qps_cqs->cq.cq; + if (qps_cqs->cq.cq_intf != NORMAL_POLL_INTF) { + qps_cqs->cq.cq_family = ibv_exp_query_intf(ctx->context, intf_params, &intf_status); + if (!qps_cqs->cq.cq_family) { + fprintf(stderr, "Couldn't create CQ family (intf_status = %d)\n", intf_status); + goto 
destroy_cq; + } + } + + return 0; + +destroy_cq: + ibv_destroy_cq(qps_cqs->cq.cq); + +free_wc: + free(qps_cqs->cq.wc); + + return 1; +} + +static void destroy_cq(struct intf_context *ctx, + struct qp_cq_data *qps_cqs, + struct ibv_exp_release_intf_params *rel_intf) +{ + ibv_exp_release_intf(ctx->context, qps_cqs->cq.cq_family, rel_intf); + ibv_destroy_cq(qps_cqs->cq.cq); + free(qps_cqs->cq.wc); +} + +static int create_qp(struct intf_context *ctx, + struct qp_cq_data *qps_cqs, + struct ibv_qp_attr *attr, + struct ibv_exp_qp_init_attr *init_attr, + struct ibv_exp_query_intf_params *intf_params) +{ + enum ibv_exp_query_intf_status intf_status; + struct qp_data *qp = &qps_cqs->qp; + int max_wr; + void *tmp; + int j; + + init_attr->recv_cq = qps_cqs->cq.cq; + init_attr->send_cq = qps_cqs->cq.cq; + + init_attr->pd = ctx->pd, + init_attr->max_inl_recv = qp->max_inl_recv_data; + init_attr->cap.max_send_wr = qp->max_wrs, + init_attr->cap.max_recv_wr = qp->max_wrs, + init_attr->cap.max_send_sge = 1, + init_attr->cap.max_recv_sge = 1, + init_attr->cap.max_inline_data = qp->max_inline_data, + init_attr->qp_type = IBV_QPT_RC; + + qp->psn = lrand48() & 0xffffff; + + /* allocate WR, WC and sg list buffers */ + max_wr = qp->max_wrs; + qp->sg_list = calloc(1, max_wr * sizeof(struct ibv_sge)); + if (ctx->is_send) { + qp->send_wr = calloc(1, sizeof(struct ibv_send_wr) * max_wr); + tmp = qp->send_wr; + } else { + qp->recv_wr = calloc(1, sizeof(struct ibv_recv_wr) * max_wr); + tmp = qp->recv_wr; + } + if (!tmp || !qp->sg_list) { + fprintf(stderr, "Couldn't allocate WRs/WCs buffers\n"); + goto clean_qp; + } + + /* Create the QP */ + init_attr->res_domain = ctx->threads[qps_cqs->idx].single_res_domain; + init_attr->max_inl_recv = qp->max_inl_recv_data; + if (qp->max_inl_recv_data) + init_attr->comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; + else + init_attr->comp_mask &= ~IBV_EXP_QP_INIT_ATTR_INL_RECV; + + qp->qp = ibv_exp_create_qp(ctx->context, init_attr); + if (!qp->qp) { + 
fprintf(stderr, "Couldn't create QP\n"); + goto clean_qp; + } + + /* Create messages buffer */ + qp->msg_stride = mmax(qp->msg_size, 64); + qp->buf = memalign(sysconf(_SC_PAGESIZE), max_wr * qp->msg_stride); + if (!qp->buf) { + fprintf(stderr, "Couldn't allocate recv/send buff for qp[%d]\n", qps_cqs->idx); + goto destroy_qp; + } + qp->mr = ibv_reg_mr(ctx->pd, qp->buf, max_wr * qp->msg_stride, + IBV_ACCESS_LOCAL_WRITE); + if (!qp->mr) { + fprintf(stderr, "Couldn't allocate recv/send MR for qp[%d]\n", qps_cqs->idx); + goto free_buf; + } + + /* Modify QP to INIT */ + if (ibv_modify_qp(qp->qp, attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto dereg_mr; + } + + + /* Prepare WRs and SG lists */ + for (j = 0; j < max_wr; j++) { + char *msg_buf = qp->buf + (j * qp->msg_stride); + char send_data = calc_msg_data(j, qps_cqs->idx); + int seg_size = qp->msg_size; + + if (ctx->is_send) + memset(msg_buf, send_data, qp->msg_size); + + qp->sg_list[j].addr = (uintptr_t)msg_buf; + qp->sg_list[j].length = seg_size; + qp->sg_list[j].lkey = qp->mr->lkey; + + msg_buf += seg_size; + if (ctx->is_send) { + /* For sender prepare pre-defined send_wr */ + struct ibv_send_wr *send_wr = &qp->send_wr[j]; + + if (j % qp->wr_burst != qp->wr_burst - 1) + send_wr->next = &qp->send_wr[j + 1]; + send_wr->num_sge = 1; + send_wr->opcode = IBV_WR_SEND; + send_wr->send_flags = IBV_SEND_SIGNALED; + send_wr->sg_list = &qp->sg_list[j]; + } else { + /* For receiver prepare pre-defined recv_wr */ + struct ibv_recv_wr *recv_wr = &qp->recv_wr[j]; + + if (j % qp->wr_burst != qp->wr_burst - 1) + recv_wr->next = &qp->recv_wr[j + 1]; + recv_wr->num_sge = 1; + recv_wr->sg_list = &qp->sg_list[j]; + } + } + + /* Query for QP burst-family if selected intf is not the normal one */ + intf_params->intf = IBV_EXP_INTF_QP_BURST; + intf_params->obj = qp->qp; + if (qp->qp_intf != NORMAL_RECV_INTF && qp->qp_intf != NORMAL_SEND_INTF) { 
+ qp->qp_burst_family = ibv_exp_query_intf(ctx->context, intf_params, &intf_status); + if (!qp->qp_burst_family) { + fprintf(stderr, "Fail to query QP burst family (intf_status = %d)\n", intf_status); + goto dereg_mr; + } + } + + return 0; + +dereg_mr: + ibv_dereg_mr(qp->mr); + +free_buf: + free(qp->buf); + +destroy_qp: + ibv_destroy_qp(qp->qp); + +clean_qp: + /* free(NULL) is a no-op; clear the pointers so a later destroy_qp() cannot free them twice */ + free(qp->send_wr); + qp->send_wr = NULL; + free(qp->recv_wr); + qp->recv_wr = NULL; + free(qp->sg_list); + qp->sg_list = NULL; + + return 1; +} + +/* + * destroy_qp - release what create_qp() set up for one QP: the burst-family + * interface (only if one was queried), the MR, the message buffer, the QP + * itself and the WR / SG-list arrays. + */ +static void destroy_qp(struct intf_context *ctx, + struct qp_cq_data *qps_cqs, + struct ibv_exp_release_intf_params *rel_intf) +{ + struct qp_data *qp = &qps_cqs->qp; + + if (qp->qp_burst_family) + ibv_exp_release_intf(ctx->context, qp->qp_burst_family, rel_intf); + + ibv_dereg_mr(qp->mr); + free(qp->buf); + ibv_destroy_qp(qp->qp); + /* only one of send_wr/recv_wr is allocated; the other is NULL and free() ignores it */ + free(qp->send_wr); + qp->send_wr = NULL; + free(qp->recv_wr); + qp->recv_wr = NULL; + free(qp->sg_list); + qp->sg_list = NULL; +} + +int init_qps_cqs(struct intf_context *ctx) +{ + int i; + struct ibv_exp_qp_init_attr qp_init_attr; + struct ibv_exp_cq_init_attr cq_init_attr; + struct ibv_exp_query_intf_params intf_params; + struct ibv_exp_release_intf_params rel_intf_params; + struct ibv_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + memset(&cq_init_attr, 0, sizeof(cq_init_attr)); + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + memset(&intf_params, 0, sizeof(intf_params)); + memset(&rel_intf_params, 0, sizeof(rel_intf_params)); + + qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD | + IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | + IBV_EXP_QP_INIT_ATTR_INL_RECV; + if (ctx->use_res_domain) { + cq_init_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; + qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_RES_DOMAIN; + } + + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = ctx->ib_port_num; + qp_attr.qp_access_flags = 0; + + intf_params.intf_scope = IBV_EXP_INTF_GLOBAL; + + for (i = 0; i < ctx->num_qps_cqs; i++) { + if (create_cq(ctx,
&ctx->qps_cqs[i], &cq_init_attr, &intf_params)) + goto clean_qps_cqs; + + if (create_qp(ctx, &ctx->qps_cqs[i], &qp_attr, &qp_init_attr, &intf_params)) { + destroy_cq(ctx, &ctx->qps_cqs[i], &rel_intf_params); + goto clean_qps_cqs; + } + } + + return 0; + +clean_qps_cqs: + /* + * Entry i was never fully created (its CQ, if any, is already released + * above), so unwind entries i-1 .. 0 only. The QP goes first: a CQ + * cannot be destroyed while a QP is still attached to it. + */ + for (; i > 0; i--) { + destroy_qp(ctx, &ctx->qps_cqs[i - 1], &rel_intf_params); + destroy_cq(ctx, &ctx->qps_cqs[i - 1], &rel_intf_params); + } + + return 1; +} + +/* + * destroy_qps_cqs - tear down every QP/CQ pair created by init_qps_cqs(). + * Each QP is destroyed before the CQ it is attached to. Always returns 0. + */ +int destroy_qps_cqs(struct intf_context *ctx) +{ + struct ibv_exp_release_intf_params rel_intf_params; + int i; + + memset(&rel_intf_params, 0, sizeof(rel_intf_params)); + for (i = 0; i < ctx->num_qps_cqs; i++) { + destroy_qp(ctx, &ctx->qps_cqs[i], &rel_intf_params); + destroy_cq(ctx, &ctx->qps_cqs[i], &rel_intf_params); + } + + return 0; +} + +int init_res_domains(struct intf_context *ctx) +{ + struct ibv_exp_destroy_res_domain_attr dest_res_dom_attr; + struct ibv_exp_res_domain_init_attr res_domain_attr; + int i; + + res_domain_attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL | IBV_EXP_RES_DOMAIN_MSG_MODEL; + res_domain_attr.thread_model = IBV_EXP_THREAD_SINGLE; + res_domain_attr.msg_model = IBV_EXP_MSG_HIGH_BW; + + + /* Create resource domain per thread */ + for (i = 0; i < ctx->num_threads; i++) { + ctx->threads[i].single_res_domain = ibv_exp_create_res_domain(ctx->context, &res_domain_attr); + if (!ctx->threads[i].single_res_domain) { + fprintf(stderr, "Can't create resource domain for thread %d errno = %d\n", i, errno); + goto cleanup; + } + } + + return 0; + +cleanup: + dest_res_dom_attr.comp_mask = 0; + for (; i > 0; i--) + ibv_exp_destroy_res_domain(ctx->context, ctx->threads[i - 1].single_res_domain, &dest_res_dom_attr); + + return 1; +} + +int clean_res_domains(struct intf_context *ctx) +{ + struct ibv_exp_destroy_res_domain_attr dest_res_dom_attr; + int i; + + dest_res_dom_attr.comp_mask = 0; + + for (i = 0; i < ctx->num_threads; i++) + ibv_exp_destroy_res_domain(ctx->context, ctx->threads[i].single_res_domain, &dest_res_dom_attr); +
+ return 0; +} + +/* + * create_resources - open the IB device named in ctx->dev_name (or the first + * listed device when the name is empty), then allocate the PD, the optional + * per-thread resource domains and all QPs/CQs. + * Returns 0 on success. On failure it prints a message, releases whatever + * was acquired so far and returns 1. + */ +int create_resources(struct intf_context *ctx) +{ + struct ibv_device **dev_list; + int i; + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + /* dev_name is an array: test its content, its address is never NULL */ + ctx->ib_dev = NULL; + if (!ctx->dev_name[0]) { + ctx->ib_dev = *dev_list; + if (!ctx->ib_dev) { + fprintf(stderr, "No IB devices found\n"); + goto free_dev_list; + } + } else { + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ctx->dev_name)) { + ctx->ib_dev = dev_list[i]; + break; + } + if (!ctx->ib_dev) { + fprintf(stderr, "IB device %s not found\n", ctx->dev_name); + goto free_dev_list; + } + } + + ctx->context = ibv_open_device(ctx->ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ctx->ib_dev)); + goto free_dev_list; + } + + /* an opened device remains usable after the list is released */ + ibv_free_device_list(dev_list); + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_device; + } + + if (ctx->use_res_domain) { + ctx->dattr.comp_mask = IBV_EXP_DEVICE_ATTR_CALC_CAP | + IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | + IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ | + IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS | + IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + + if (ibv_exp_query_device(ctx->context, &ctx->dattr)) { + fprintf(stderr, "Couldn't query device capabilities.\n"); + goto clean_pd; + } + if (!(ctx->dattr.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN)) { + fprintf(stderr, "query-device failed to retrieve max_ctx_res_domain\n"); + goto clean_pd; + } + if (ctx->num_threads > ctx->dattr.max_ctx_res_domain) { + fprintf(stderr, "can't allocate resource domain per thread, required=%d, available=%d\n", + ctx->num_threads, ctx->dattr.max_ctx_res_domain); + goto clean_pd; + } + if (init_res_domains(ctx)) + goto clean_pd; + } + + if (init_qps_cqs(ctx)) + goto clean_res_doms; + + return 0; + +clean_res_doms: + if (ctx->use_res_domain) + clean_res_domains(ctx); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_device: + ibv_close_device(ctx->context); + + return 1; + +free_dev_list: + /* device was never opened: its pointer dies with the list */ + ibv_free_device_list(dev_list); + ctx->ib_dev = NULL; + + return 1; +}
+/*
+ * destroy_resources - Release the verbs resources held by ctx: the
+ * QPs/CQs first, then the resource domains (when in use), the PD and
+ * finally the device context. ctx->pd and ctx->context are reset to NULL.
+ */
+void destroy_resources(struct intf_context *ctx)
+{
+	destroy_qps_cqs(ctx);
+	if (ctx->use_res_domain)
+		clean_res_domains(ctx);
+	ibv_dealloc_pd(ctx->pd);
+	ctx->pd = NULL;
+	ibv_close_device(ctx->context);
+	ctx->context = NULL;
+}
+
+/*
+ * is_send - Defines the sender/receiver sides.
+ *
+ * Returns 0 when a server name was supplied in the input, 1 otherwise.
+ */
+static int is_send(struct intf_input *input)
+{
+	if (strlen(input->server_data.name) > 0)
+		return 0;
+
+	return 1;
+}
+
+/* Upper bound on the number of CPUs collected from the cpu sets */
+#define PS_MAX_CPUS 128
+/*
+ * create_context - Allocate and initialize application database
+ * according to application input.
+ *
+ * Copies the connection, IB and thread parameters from input into ctx,
+ * allocates the per-QP/CQ and per-thread arrays and fills them in.
+ * Threads are assigned to the requested CPUs in round-robin order.
+ *
+ * Returns 0 on success, 1 on failure (no usable CPU for the requested
+ * threads, or out of memory). Nothing stays allocated on failure.
+ */
+int create_context(struct intf_context *ctx, struct intf_input *input)
+{
+	int cpu_idx;
+	int qp_idx;
+	int i, j;
+	int cpu_array[PS_MAX_CPUS];
+	int cpus_dropped = 0;
+
+	/* Set context user name */
+	if (strlen(input->server_data.name) > 0)
+		ctx->servername = input->server_data.name;
+
+	ctx->is_send = is_send(input);
+	ctx->port = input->server_data.port;
+
+	/* Define the number of application QPs/CQs (1 QP/CQ per threads) */
+	ctx->num_qps_cqs = input->thread_prms.num_threads;
+
+	/* Data is only checked on the receiving side */
+	ctx->check_data = input->ib_data.check_data && !ctx->is_send;
+	ctx->use_res_domain = input->ib_data.use_res_domain;
+	ctx->num_threads = input->thread_prms.num_threads;
+	cpu_idx = 0;
+
+	/* create array of CPUs which application threads may run on */
+	for (i = 0; i < input->thread_prms.num_cpu_sets; i++) {
+		for (j = input->thread_prms.cpu_sets[i].min;
+		     j <= input->thread_prms.cpu_sets[i].max; j++) {
+			if (cpu_idx == PS_MAX_CPUS) {
+				/* Array is full: skip the rest of this set */
+				cpus_dropped = 1;
+				break;
+			}
+			cpu_array[cpu_idx++] = j;
+		}
+	}
+	/* Warn once instead of once per ignored CPU */
+	if (cpus_dropped)
+		fprintf(stderr, "Supporting up to %d cpus (ignoring some of requested cpus)\n", PS_MAX_CPUS);
+
+	/* The thread loop below computes i % cpu_idx, so cpu_idx must not be 0 */
+	if (ctx->num_threads > 0 && !cpu_idx) {
+		fprintf(stderr, "No CPUs to run the application threads on\n");
+		return 1;
+	}
+
+	ctx->ib_port_num = input->ib_data.ib_port_num;
+	ctx->sl = input->ib_data.sl;
+	ctx->mtu = input->ib_data.mtu;
+	if (strlen(input->ib_data.dev_name) > 0 && strlen(input->ib_data.dev_name) < MAX_DEV_NAME_SIZE - 1)
+		strcpy(ctx->dev_name, input->ib_data.dev_name);
+
+	/*
+	 * Allocate qps, cqs and threads arrays. Passing count and element
+	 * size separately lets calloc() detect a multiplication overflow.
+	 */
+	ctx->qps_cqs = calloc(ctx->num_qps_cqs, sizeof(*ctx->qps_cqs));
+	if (!ctx->qps_cqs)
+		return 1;
+
+	ctx->threads = calloc(ctx->num_threads, sizeof(*ctx->threads));
+	if (!ctx->threads)
+		goto free_qps_cqs;
+
+	/* Update QPs and CQs data */
+	for (i = 0; i < ctx->num_qps_cqs; i++) {
+		struct qp_cq_data *qp_cq_data = &ctx->qps_cqs[i];
+		struct qp_data *qp_data = &qp_cq_data->qp;
+
+		/* Update QP data */
+		qp_cq_data->idx = i;
+		qp_data->max_inl_recv_data = input->qp_prms.max_inl_recv_data;
+		qp_data->msg_size = input->send_prms.msg_size;
+		qp_data->num_msgs = input->send_prms.num_qp_msgs;
+		qp_data->wr_burst = input->qp_prms.wr_burst;
+		qp_data->max_inline_data = 0;
+
+		if (ctx->is_send) {
+			qp_data->qp_intf = send_2_qp[input->qp_prms.verbs_send_intf];
+			qp_cq_data->cq.cq_intf = poll_2_cq[input->qp_prms.verbs_send_poll_intf];
+			qp_data->max_wrs = input->qp_prms.max_send_wr;
+
+			/* If SEND_PENDING_INL interface selected enable inline data */
+			if (qp_data->qp_intf == ACC_SEND_PENDING_INL_INTF)
+				qp_data->max_inline_data = input->send_prms.msg_size;
+		} else {
+			qp_data->qp_intf = recv_2_qp[input->qp_prms.verbs_recv_intf];
+			qp_cq_data->cq.cq_intf = poll_2_cq[input->qp_prms.verbs_recv_poll_intf];
+
+			/* If inline received supported chose the ACC_POLL_LENGTH_INL_INTF
+			 * interface instead of ACC_POLL_LENGTH_INTF
+			 */
+			if (qp_cq_data->qp.max_inl_recv_data &&
+			    qp_cq_data->cq.cq_intf == ACC_POLL_LENGTH_INTF)
+				qp_cq_data->cq.cq_intf = ACC_POLL_LENGTH_INL_INTF;
+			qp_data->max_wrs = input->qp_prms.max_recv_wr;
+		}
+		qp_cq_data->cq.cq_size = qp_data->max_wrs;
+		qp_cq_data->cq.wc_burst = input->qp_prms.wr_burst;
+	}
+
+	/* Update threads data */
+	for (i = 0, qp_idx = 0; i < ctx->num_threads; i++, qp_idx++) {
+		/* Update the range of QPs used by each thread */
+		ctx->threads[i].qp_idx = qp_idx;
+		/* Update the thread cpu (round-robin over the collected CPUs) */
+		ctx->threads[i].cpu = cpu_array[i % cpu_idx];
+	}
+
+	return 0;
+
+free_qps_cqs:
+	free(ctx->qps_cqs);
+	ctx->qps_cqs = NULL;
+
+	return 1;
+}
+
+void destroy_context(struct intf_context *ctx)
+{
+	
free(ctx->qps_cqs);
+	free(ctx->threads);
+}
+
+/*
+ * client_exch_dest - Client side of the out-of-band (TCP) address exchange.
+ *
+ * Connects to servername:port, sends the local LID/GID and reads the
+ * remote ones, then for each of the num_qps QPs sends the local QPN/PSN
+ * and reads the remote pair into rem_dest. A final "done" message tells
+ * the server that the exchange is complete.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int client_exch_dest(const char *servername, int port,
+			    const struct ib_dest *my_dest,
+			    struct ib_dest *rem_dest, int num_qps)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[sizeof("0000:00000000000000000000000000000000")];
+	char qp_msg[sizeof("000000:000000")];
+	int n;
+	int sockfd = -1;
+	char gid[33];
+	int i;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return 1;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+
+	/* Any non-zero value is an error; EAI_* codes are positive on BSD */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		free(service);
+		return 1;
+	}
+
+	/* Use the first address that accepts the connection */
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+		return 1;
+	}
+
+	gid_to_wire_gid(&my_dest->gid, gid);
+	snprintf(msg, sizeof(msg), "%04x:%s", my_dest->lid, gid);
+	if (write(sockfd, msg, sizeof(msg)) != sizeof(msg)) {
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	if (recv(sockfd, msg, sizeof(msg), MSG_WAITALL) != sizeof(msg)) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read remote address\n");
+		goto out;
+	}
+
+	/* Never trust the peer to NUL-terminate; bound the GID to gid[33] */
+	msg[sizeof(msg) - 1] = '\0';
+	if (sscanf(msg, "%x:%32s", &rem_dest->lid, gid) != 2) {
+		fprintf(stderr, "Couldn't parse remote address\n");
+		goto out;
+	}
+	wire_gid_to_gid(gid, &rem_dest->gid);
+
+	for (i = 0; i < num_qps; i++) {
+		snprintf(qp_msg, sizeof(qp_msg), "%06x:%06x", my_dest->qpn[i], my_dest->psn[i]);
+		if (write(sockfd, qp_msg, sizeof(qp_msg)) != sizeof(qp_msg)) {
+			fprintf(stderr, "Couldn't send local qp[%d] data\n", i);
+			goto out;
+		}
+
+		if (recv(sockfd, qp_msg, sizeof(qp_msg), MSG_WAITALL) != sizeof(qp_msg)) {
+			perror("client read");
+			fprintf(stderr, "Couldn't read remote qp[%d] data\n", i);
+			goto out;
+		}
+		qp_msg[sizeof(qp_msg) - 1] = '\0';
+		if (sscanf(qp_msg, "%x:%x", &rem_dest->qpn[i], &rem_dest->psn[i]) != 2) {
+			fprintf(stderr, "Couldn't parse remote qp[%d] data\n", i);
+			goto out;
+		}
+	}
+
+	if (write(sockfd, "done", sizeof("done")) != sizeof("done")) {
+		fprintf(stderr, "Couldn't send \"done\" msg\n");
+		goto out;
+	}
+
+	close(sockfd);
+
+	return 0;
+
+
+out:
+	close(sockfd);
+
+	return 1;
+}
+
+/*
+ * server_exch_dest - Server side of the out-of-band (TCP) address exchange.
+ *
+ * Listens on port, accepts a single connection, reads the remote LID/GID
+ * and answers with the local ones, then for each of the num_qps QPs reads
+ * the remote QPN/PSN into rem_dest and answers with the local pair.
+ * Finally waits for the client's "done" message.
+ *
+ * ctx, ib_port, mtu, sl and sgid_idx are not used by this function.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int server_exch_dest(struct intf_context *ctx,
+			    int ib_port, enum ibv_mtu mtu,
+			    int port, int sl,
+			    const struct ib_dest *my_dest,
+			    struct ib_dest *rem_dest,
+			    int sgid_idx, int num_qps)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags    = AI_PASSIVE,
+		.ai_family   = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[sizeof("0000:00000000000000000000000000000000")];
+	char qp_msg[sizeof("000000:000000")];
+	int n;
+	int sockfd = -1, connfd;
+	char gid[33] = "";
+	int i;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return 1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+
+	/* Any non-zero value is an error; EAI_* codes are positive on BSD */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		free(service);
+		return 1;
+	}
+
+	/* Use the first address that can be bound */
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n));
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return 1;
+	}
+
+	if (listen(sockfd, 1) < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		close(sockfd);
+		return 1;
+	}
+	connfd = accept(sockfd, NULL, 0);
+	close(sockfd);
+	if (connfd < 0) {
+		fprintf(stderr, "accept() failed\n");
+		return 1;
+	}
+
+	n = recv(connfd, msg, sizeof(msg), MSG_WAITALL);
+	if (n != sizeof(msg)) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof(msg));
+		goto out;
+	}
+
+	/* Never trust the peer to NUL-terminate; bound the GID to gid[33] */
+	msg[sizeof(msg) - 1] = '\0';
+	if (sscanf(msg, "%x:%32s", &rem_dest->lid, gid) != 2) {
+		fprintf(stderr, "Couldn't parse remote address\n");
+		goto out;
+	}
+	wire_gid_to_gid(gid, &rem_dest->gid);
+
+
+	gid_to_wire_gid(&my_dest->gid, gid);
+	snprintf(msg, sizeof(msg), "%04x:%s", my_dest->lid, gid);
+	if (write(connfd, msg, sizeof(msg)) != sizeof(msg)) {
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	for (i = 0; i < num_qps; i++) {
+		if (recv(connfd, qp_msg, sizeof(qp_msg), MSG_WAITALL) != sizeof(qp_msg)) {
+			perror("server read");
+			fprintf(stderr, "Couldn't read remote qp[%d] data\n", i);
+			goto out;
+		}
+		qp_msg[sizeof(qp_msg) - 1] = '\0';
+		if (sscanf(qp_msg, "%x:%x", &rem_dest->qpn[i], &rem_dest->psn[i]) != 2) {
+			fprintf(stderr, "Couldn't parse remote qp[%d] data\n", i);
+			goto out;
+		}
+
+		snprintf(qp_msg, sizeof(qp_msg), "%06x:%06x", my_dest->qpn[i], my_dest->psn[i]);
+		if (write(connfd, qp_msg, sizeof(qp_msg)) != sizeof(qp_msg)) {
+			fprintf(stderr, "Couldn't send local qp[%d] data\n", i);
+			goto out;
+		}
+	}
+
+
+	/* expecting "done" msg */
+	if (read(connfd, msg, sizeof(msg)) <= 0) {
+		fprintf(stderr, "Couldn't read \"done\" msg\n");
+		goto out;
+	}
+
+	close(connfd);
+
+	return 0;
+
+out:
+	close(connfd);
+
+	return 1;
+}
+
+
+/*
+ * exchange_remote_data - Exchange addressing data with the peer and
+ * connect every local QP to its remote counterpart.
+ *
+ * Only ports whose link layer is InfiniBand are supported. The process
+ * acts as client when ctx->servername is set and as server otherwise.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int exchange_remote_data(struct intf_context *ctx)
+{
+	int i;
+	int ret = 1;
+	struct ib_dest my_dest;
+	struct ib_dest rem_dest;
+	char gid[INET6_ADDRSTRLEN];
+	struct ibv_port_attr portinfo;
+
+	if (ibv_query_port(ctx->context, ctx->ib_port_num, &portinfo)) {
+		fprintf(stderr, "Couldn't get port info\n");
+		return 1;
+	}
+
+	if (portinfo.link_layer != IBV_LINK_LAYER_INFINIBAND) {
+		fprintf(stderr, "link_layer != IBV_LINK_LAYER_INFINIBAND\n");
+		return 1;
+	}
+
+	my_dest.lid = portinfo.lid;
+	if (!my_dest.lid) {
+		fprintf(stderr, "Couldn't get local LID\n");
+		return 1;
+	}
+
+	/* No GID is used on this path: an all-zero GID is advertised */
+	memset(&my_dest.gid, 0, sizeof(my_dest.gid));
+
+	my_dest.psn = calloc(ctx->num_qps_cqs, sizeof(*my_dest.psn));
+	my_dest.qpn = calloc(ctx->num_qps_cqs, sizeof(*my_dest.qpn));
+	rem_dest.psn = calloc(ctx->num_qps_cqs, sizeof(*rem_dest.psn));
+	rem_dest.qpn = calloc(ctx->num_qps_cqs, sizeof(*rem_dest.qpn));
+
+	if (!my_dest.psn || !my_dest.qpn || !rem_dest.psn || !rem_dest.qpn)
+		goto free_buffs;
+
+	for (i = 0; i < ctx->num_qps_cqs; i++) {
+		my_dest.qpn[i] = ctx->qps_cqs[i].qp.qp->qp_num;
+		my_dest.psn[i] = ctx->qps_cqs[i].qp.psn;
+	}
+	inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof(gid));
+	printf(" local address: LID 0x%04x, GID %s\n", my_dest.lid, gid);
+
+	if (ctx->servername) {
+		if (client_exch_dest(ctx->servername, ctx->port,
+				     &my_dest, &rem_dest, ctx->num_qps_cqs)) {
+			fprintf(stderr, "Couldn't get remote LID\n");
+			goto free_buffs;
+		}
+	} else {
+		if (server_exch_dest(ctx, ctx->ib_port_num, ctx->mtu, ctx->port, ctx->sl,
+				     &my_dest, &rem_dest, 0, ctx->num_qps_cqs)) {
+			fprintf(stderr, "Couldn't get remote LID\n");
+			goto free_buffs;
+		}
+	}
+
+	for (i = 0; i < ctx->num_qps_cqs; i++) {
+		ctx->qps_cqs[i].qp.remote_qpn = rem_dest.qpn[i];
+		if (connect_qp(ctx->qps_cqs[i].qp.qp, ctx->ib_port_num, my_dest.psn[i], ctx->mtu, ctx->sl,
+			       rem_dest.gid, rem_dest.lid, rem_dest.psn[i],
+			       rem_dest.qpn[i], 0)) {
+			fprintf(stderr, "Couldn't connect to remote qp[%d]\n", i);
+			goto free_buffs;
+		}
+	}
+
+	ret = 0;
+
+free_buffs:
+	/*
+	 * The arrays are scratch space for the exchange only (everything
+	 * needed later was passed on by value), so release them on the
+	 * success path as well as on failure.
+	 */
+	free(my_dest.psn);
+	free(my_dest.qpn);
+	free(rem_dest.psn);
+	free(rem_dest.qpn);
+
+	return ret;
+}
+
+/*
+ * mtu_to_enum - Convert an MTU size in bytes to the matching ibv_mtu value.
+ *
+ * Unsupported sizes yield -1 converted to enum ibv_mtu. Whether that value
+ * compares as negative depends on the type the compiler picks for the
+ * enum, so callers should compare against (enum ibv_mtu) -1 rather than
+ * test for "< 0".
+ */
+static inline enum ibv_mtu mtu_to_enum(int mtu)
+{
+	switch (mtu) {
+	case 256:  return IBV_MTU_256;
+	case 512:  return IBV_MTU_512;
+	case 1024: return IBV_MTU_1024;
+	case 2048: return IBV_MTU_2048;
+	case 4096: return IBV_MTU_4096;
+	default:   return -1;
+	}
+}
+
+/*
+ * enum_to_mtu - Convert an ibv_mtu value to its size in bytes,
+ * or -1 for a value that is not one of the known MTUs.
+ */
+static inline int enum_to_mtu(enum ibv_mtu mtu)
+{
+	switch (mtu) {
+	case IBV_MTU_256:  return 256;
+	case IBV_MTU_512:  return 512;
+	case IBV_MTU_1024: return 1024;
+	case IBV_MTU_2048: return 2048;
+	case IBV_MTU_4096: return 4096;
+	default:	   return -1;
+	}
+}
+
+static void usage(const char *argv0, struct intf_input *default_input)
+{
+	int i;
+
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n", argv0);
+	printf(" %s connect to server at \n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port= listen on/connect to port (default %d)\n",
default_input->server_data.port); + printf(" -d, --ib-dev= use IB device (default %s)\n", default_input->ib_data.dev_name); + printf(" -i, --ib-port= use port of IB device (default %d)\n", default_input->ib_data.ib_port_num); + printf(" -s, --size= size of message (default %'d max size %'d)\n", default_input->send_prms.msg_size, MAX_MSG_SIZE); + printf(" -m, --mtu= path MTU (default %'d)\n", enum_to_mtu(default_input->ib_data.mtu)); + printf(" -r, --rx-depth= receive queue size (default %'d)\n", default_input->qp_prms.max_recv_wr); + printf(" -n, --iters= number of messages (default %'d)\n", default_input->send_prms.num_qp_msgs); + printf(" -l, --sl= service level value (default %d)\n", default_input->ib_data.sl); + printf(" -t, --inline-recv= size of inline-recv (default %d)\n", default_input->qp_prms.max_inl_recv_data); + printf(" -S, --send-verb= send verb interface to use S_NORM/S_PEND/S_PEND_INL/S_PEND_SG_LIST/S_BURST (default %s)\n", + send_enum_to_verbs_intf_str(default_input->qp_prms.verbs_send_intf)); + printf(" -R, --recv-verb= recv verb interface to use R_NORM/R_BURST (default %s)\n", + recv_enum_to_verbs_intf_str(default_input->qp_prms.verbs_recv_intf)); + printf(" -P, --poll-verb= poll verb interface to use P_NORM/P_CNT/P_LEN (default send: %s recv: %s)\n", + poll_enum_to_verbs_intf_str(default_input->qp_prms.verbs_send_poll_intf), + poll_enum_to_verbs_intf_str(default_input->qp_prms.verbs_recv_poll_intf)); + printf(" -c, --cpus-list= CPUs list to run on (default "); + for (i = 0; i < default_input->thread_prms.num_cpu_sets; i++) { + printf("[%d..%d]", default_input->thread_prms.cpu_sets[i].min, default_input->thread_prms.cpu_sets[i].max); + if (i + 1 == default_input->thread_prms.num_cpu_sets) + printf(")\n"); + else + printf(","); + } + printf(" -b, --burst= size of send/recv wr burst (default %'d)\n", default_input->qp_prms.wr_burst); + printf(" -T, --num-threads= Number of threads to run (default %'d)\n", default_input->thread_prms.num_threads); + 
printf(" -C, --check-data check the data received (default no-checks)\n");
+	printf(" -A, --avoid-res-domain avoid usage of resource domain (default use res-domain)\n");
+}
+
+/*
+ * str_to_cpu_set - Parse a CPU list such as "[0..3],[8..11]".
+ *
+ * Each "[min..max]" range is stored in cpu_sets, up to MAX_CPU_SETS
+ * entries, and the number of ranges is written to *num_cpus. The single
+ * character following a ']' is treated as a separator and skipped.
+ *
+ * Returns 0 on success. Returns 1 when the string is 64 characters or
+ * longer, a range cannot be parsed, a range has a negative CPU or
+ * min > max, or no range was found. cpu_sets may be partially written
+ * on failure; *num_cpus is only written on success.
+ */
+int str_to_cpu_set(char *str, int *num_cpus, struct cpu_set *cpu_sets)
+{
+	char *p;
+	char *t;
+	int min, max;
+	char stmp[64];
+	int ncpus = 0;
+
+	if (strlen(str) >= sizeof(stmp))
+		return 1;
+
+	/* Work on a copy: the parser cuts the string at each range end */
+	strcpy(stmp, str);
+	p = stmp;
+
+	while (ncpus < MAX_CPU_SETS && p && strlen(p)) {
+		/* Terminate the current range and remember where the next starts */
+		t = strchr(p, ']');
+		if (t) {
+			t++;
+			if (strlen(t)) {
+				*t = 0;
+				t++;
+			}
+		}
+
+		if (sscanf(p, "[%d..%d]", &min, &max) != 2)
+			return 1;
+		p = t;
+
+		/* A CPU number cannot be negative */
+		if (min < 0 || min > max)
+			return 1;
+
+		cpu_sets[ncpus].min = min;
+		cpu_sets[ncpus].max = max;
+		ncpus++;
+	}
+
+	if (!ncpus)
+		return 1;
+
+	*num_cpus = ncpus;
+
+	return 0;
+}
+
+/*
+ * parse_input - Create input data for the test based on the
+ * default_input and application parameters
+ *
+ * input starts as a copy of default_input and is then overridden by the
+ * command-line options. A single non-option argument is taken as the
+ * server name. The combination of burst size, queue sizes and poll
+ * interface is validated at the end.
+ *
+ * Returns 0 on success and 1 on invalid input (usage is printed for
+ * option errors).
+ */
+int parse_input(struct intf_input *input, struct intf_input *default_input, int argc, char *argv[])
+{
+	int tmp;
+	enum ibv_mtu mtu;
+	unsigned long long size;
+
+	memcpy(input, default_input, sizeof(*input));
+
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "mtu", .has_arg = 1, .val = 'm' },
+			{ .name = "rx-depth", .has_arg = 1, .val = 'r' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "sl", .has_arg = 1, .val = 'l' },
+			{ .name = "inline-recv", .has_arg = 1, .val = 't' },
+			{ .name = "send-verb", .has_arg = 1, .val = 'S' },
+			{ .name = "recv-verb", .has_arg = 1, .val = 'R' },
+			{ .name = "poll-verb", .has_arg = 1, .val = 'P' },
+			{ .name = "cpus-list", .has_arg = 1, .val = 'c' },
+			{ .name = "burst", .has_arg = 1, .val = 'b' },
+			{ .name = "num-threads", .has_arg = 1, .val = 'T' },
+			{ .name = "check-data", .has_arg = 0, .val = 'C' },
+			{ .name = "avoid-res-domain", .has_arg = 0, .val = 'A' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:t:c:S:R:P:b:T:CA",
+				long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			tmp = strtol(optarg, NULL, 0);
+			if (tmp < 0 || tmp > 65535)
+				goto print_usage;
+			input->server_data.port = tmp;
+
+			break;
+
+		case 'd':
+			/* optarg points into argv, so no private copy is needed */
+			if (strlen(optarg) >= MAX_DEV_NAME_SIZE - 1) {
+				fprintf(stderr, "Device name too long (max %d)\n", MAX_DEV_NAME_SIZE - 1);
+				goto print_usage;
+			}
+
+			strcpy(input->ib_data.dev_name, optarg);
+			break;
+
+		case 'i':
+			tmp = strtol(optarg, NULL, 0);
+			if (tmp < 0)
+				goto print_usage;
+			input->ib_data.ib_port_num = tmp;
+			break;
+
+		case 's':
+			/*
+			 * size is unsigned: a negative argument wraps to a
+			 * huge value and is rejected by the upper bound.
+			 */
+			size = strtoll(optarg, NULL, 0);
+
+			if (size > MAX_MSG_SIZE)
+				goto print_usage;
+
+			input->send_prms.msg_size = size;
+			break;
+
+		case 'm':
+			/*
+			 * mtu_to_enum() reports an unsupported size as -1
+			 * converted to the enum type. The enum may be
+			 * unsigned, so compare against that exact value
+			 * instead of testing for "< 0".
+			 */
+			mtu = mtu_to_enum(strtol(optarg, NULL, 0));
+			if (mtu == (enum ibv_mtu) -1)
+				goto print_usage;
+			input->ib_data.mtu = mtu;
+			break;
+
+		case 'r':
+			tmp = strtol(optarg, NULL, 0);
+			input->qp_prms.max_recv_wr = tmp;
+			break;
+
+		case 'n':
+			tmp = strtol(optarg, NULL, 0);
+			input->send_prms.num_qp_msgs = tmp;
+			break;
+
+		case 'l':
+			tmp = strtol(optarg, NULL, 0);
+			input->ib_data.sl = tmp;
+			break;
+
+		case 't':
+			tmp = strtol(optarg, NULL, 0);
+			input->qp_prms.max_inl_recv_data = tmp;
+			if (input->qp_prms.max_inl_recv_data > MAX_INLINE_RECV) {
+				fprintf(stderr, "Max allowed inline-recv = %d\n", MAX_INLINE_RECV);
+				goto print_usage;
+			}
+			break;
+
+		case 'S':
+			if (!strcmp(optarg, "S_NORM")) {
+				input->qp_prms.verbs_send_intf = IN_NORMAL_SEND_INTF;
+			} else if (!strcmp(optarg, "S_PEND")) {
+				input->qp_prms.verbs_send_intf = IN_ACC_SEND_PENDING_INTF;
+			} else if (!strcmp(optarg, "S_PEND_INL")) {
+				input->qp_prms.verbs_send_intf = IN_ACC_SEND_PENDING_INL_INTF;
+			} else if (!strcmp(optarg, "S_PEND_SG_LIST")) {
+				input->qp_prms.verbs_send_intf = IN_ACC_SEND_PENDING_SG_LIST_INTF;
+			} else if (!strcmp(optarg, "S_BURST")) {
+				input->qp_prms.verbs_send_intf = IN_ACC_SEND_BURST_INTF;
+			} else {
+				fprintf(stderr, "Send interface name %s not supported\n", optarg);
+				goto print_usage;
+			}
+			break;
+
+		case 'R':
+			if (!strcmp(optarg, "R_NORM")) {
+				input->qp_prms.verbs_recv_intf = IN_NORMAL_RECV_INTF;
+			} else if (!strcmp(optarg, "R_BURST")) {
+				input->qp_prms.verbs_recv_intf = IN_ACC_RECV_BURST_INTF;
+			} else {
+				fprintf(stderr, "Receive interface name %s not supported\n", optarg);
+				goto print_usage;
+			}
+			break;
+
+		case 'P':
+			/* The same poll interface is applied to both directions */
+			if (!strcmp(optarg, "P_NORM")) {
+				input->qp_prms.verbs_recv_poll_intf = IN_NORMAL_POLL_INTF;
+				input->qp_prms.verbs_send_poll_intf = IN_NORMAL_POLL_INTF;
+			} else if (!strcmp(optarg, "P_CNT")) {
+				input->qp_prms.verbs_recv_poll_intf = IN_ACC_POLL_CNT_INTF;
+				input->qp_prms.verbs_send_poll_intf = IN_ACC_POLL_CNT_INTF;
+			} else if (!strcmp(optarg, "P_LEN")) {
+				input->qp_prms.verbs_recv_poll_intf = IN_ACC_POLL_LENGTH_INTF;
+				input->qp_prms.verbs_send_poll_intf = IN_ACC_POLL_LENGTH_INTF;
+			} else {
+				fprintf(stderr, "Poll interface name %s not supported\n", optarg);
+				goto print_usage;
+			}
+			break;
+
+		case 'c':
+			if (str_to_cpu_set(optarg, &input->thread_prms.num_cpu_sets, input->thread_prms.cpu_sets)) {
+				fprintf(stderr, "Wrong cpus list: %s\n", optarg);
+				goto print_usage;
+			}
+			break;
+
+		case 'b':
+			/*
+			 * A burst of 0 must be rejected: the burst size is
+			 * used as a divisor in the validation below.
+			 */
+			tmp = strtol(optarg, NULL, 0);
+			if (tmp <= 0 || tmp > 65535)
+				goto print_usage;
+
+			input->qp_prms.wr_burst = tmp;
+			break;
+
+		case 'T':
+			/* At least one thread is needed to run the test */
+			tmp = strtol(optarg, NULL, 0);
+			if (tmp <= 0 || tmp > 65535)
+				goto print_usage;
+
+			input->thread_prms.num_threads = tmp;
+			break;
+
+		case 'C':
+			input->ib_data.check_data = 1;
+			break;
+
+		case 'A':
+			input->ib_data.use_res_domain = 0;
+			break;
+
+		default:
+			goto print_usage;
+		}
+	}
+
+	if (optind == argc - 1) {
+		/*
+		 * A name that does not fit must be an error: dropping it
+		 * silently would turn this client into a server.
+		 */
+		if (strlen(argv[optind]) >= MAX_SERVER_NAME_SIZE - 1) {
+			fprintf(stderr, "Server name too long (max %d)\n", MAX_SERVER_NAME_SIZE - 2);
+			goto print_usage;
+		}
+		if (strlen(argv[optind]) > 0)
+			strcpy(input->server_data.name, argv[optind]);
+	} else if (optind < argc) {
+		goto print_usage;
+	}
+
+	if (is_send(input) && input->qp_prms.wr_burst * 2 >= input->qp_prms.max_send_wr) {
+		fprintf(stderr, "Invalid input, max_send_wr(%d) should be at least twice the size of burst size(%d)\n",
+			input->qp_prms.max_send_wr, input->qp_prms.wr_burst);
+		return 1;
+	}
+
+	if (!is_send(input) && input->qp_prms.wr_burst * 2 >= input->qp_prms.max_recv_wr) {
+		fprintf(stderr, "Invalid input, max_recv_wr(%d) should be at least twice the size of burst size(%d)\n",
+			input->qp_prms.max_recv_wr, input->qp_prms.wr_burst);
+		return 1;
+	}
+
+	if (is_send(input) && input->qp_prms.max_send_wr % input->qp_prms.wr_burst) {
+		fprintf(stderr, "Invalid input modulo(max_send_wr(%d), burst size(%d)) != 0\n",
+			input->qp_prms.max_send_wr, input->qp_prms.wr_burst);
+		return 1;
+	}
+
+	if (!is_send(input) && input->qp_prms.max_recv_wr % input->qp_prms.wr_burst) {
+		fprintf(stderr, "Invalid input modulo(max_recv_wr(%d), burst size(%d)) != 0\n",
+			input->qp_prms.max_recv_wr, input->qp_prms.wr_burst);
+		return 1;
+	}
+
+	/* We can't use IN_ACC_POLL_LENGTH_INTF to poll for send messages completion */
+	if (is_send(input) && input->qp_prms.verbs_send_poll_intf == IN_ACC_POLL_LENGTH_INTF) {
+		fprintf(stderr, "It is not allowed to use poll-length(P_LEN) for send messages.\n");
+		fprintf(stderr, "Use -P P_CNT or -P P_NORM options to poll for send messages completion\n");
+		return 1;
+	}
+
+	return 0;
+
+print_usage:
+	usage(argv[0], default_input);
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	struct intf_context *ctx;
+	int ret = 0;
+
+	setlocale(LC_NUMERIC, "");
+	srand48(getpid() * time(NULL));
+
+	if (parse_input(&intf_input, &intf_default_input, argc, argv)) {
+		fprintf(stderr, "Failed to update and validate test inputs\n");
+		return
1; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return 1; + + if (create_context(ctx, &intf_input)) { + fprintf(stderr, "Failed to create test context\n"); + ret = 1; + goto free_ctx; + } + + if (create_resources(ctx)) { + fprintf(stderr, "Failed to create test resources\n"); + ret = 1; + goto destroy_context; + } + + if (exchange_remote_data(ctx)) { + fprintf(stderr, "Failed to create test context and resources\n"); + ret = 1; + goto destroy_resources; + } + + if (run_threads(ctx)) { + fprintf(stderr, "Failed in test execution\n"); + ret = 1; + goto destroy_resources; + } + +destroy_resources: + destroy_resources(ctx); + +destroy_context: + destroy_context(ctx); + +free_ctx: + free(ctx); + + return ret; +} Index: contrib/ofed/libibverbs/examples/pingpong.h =================================================================== --- contrib/ofed/libibverbs/examples/pingpong.h +++ contrib/ofed/libibverbs/examples/pingpong.h @@ -33,9 +33,8 @@ #ifndef IBV_PINGPONG_H #define IBV_PINGPONG_H -#include - #include +#include enum ibv_mtu pp_mtu_to_enum(int mtu); uint16_t pp_get_local_lid(struct ibv_context *context, int port); Index: contrib/ofed/libibverbs/examples/pingpong.c =================================================================== --- contrib/ofed/libibverbs/examples/pingpong.c +++ contrib/ofed/libibverbs/examples/pingpong.c @@ -68,19 +68,22 @@ { char tmp[9]; uint32_t v32; + uint32_t *raw = (uint32_t *)gid->raw; int i; for (tmp[8] = 0, i = 0; i < 4; ++i) { memcpy(tmp, wgid + i * 8, 8); sscanf(tmp, "%x", &v32); - *(uint32_t *)(&gid->raw[i * 4]) = ntohl(v32); + raw[i] = ntohl(v32); } } void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) { int i; + uint32_t *raw = (uint32_t *)gid->raw; for (i = 0; i < 4; ++i) - sprintf(&wgid[i * 8], "%08x", htonl(*(uint32_t *)(gid->raw + i * 4))); + sprintf(&wgid[i * 8], "%08x", + htonl(raw[i])); } Index: contrib/ofed/libibverbs/examples/polldcinfo.c =================================================================== 
--- /dev/null +++ contrib/ofed/libibverbs/examples/polldcinfo.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dc.h" + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + char *ib_devname = NULL; + int ib_port = 1; + struct ibv_context *ctx; + int n; + struct ibv_exp_dc_info_ent ents[2]; + int i; + int j; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { 0 } + }; + + c = getopt_long(argc, argv, "d:i:", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind < argc) { + usage(argv[0]); + return 1; + } + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = ibv_open_device(ib_dev); + if (!ctx) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + return 1; + } + + while (1) { + n = 
ibv_exp_poll_dc_info(ctx, ents, 2, ib_port); + if (n < 0) { + printf("error polling dc info, aborting\n"); + return -1; + } + for (i = 0; i < n; i++) { + printf("=== sqeuence number 0x%08x\n", ents[i].seqnum); + for (j = 0; j < 30; j++) + printf("lid[%d] = 0x%04x\n", j, ents[i].lid[j]); + } + } + + return 0; +} Index: contrib/ofed/libibverbs/examples/rc_pingpong.c =================================================================== --- contrib/ofed/libibverbs/examples/rc_pingpong.c +++ contrib/ofed/libibverbs/examples/rc_pingpong.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -55,6 +54,10 @@ }; static int page_size; +static int use_contiguous_mr; +static int use_odp; +static void *contig_addr; +static int family = AF_INET; struct pingpong_context { struct ibv_context *context; @@ -64,10 +67,11 @@ struct ibv_cq *cq; struct ibv_qp *qp; void *buf; - int size; + unsigned long long size; int rx_depth; int pending; - struct ibv_port_attr portinfo; + struct ibv_port_attr portinfo; + int inlr_recv; }; struct pingpong_dest { @@ -140,7 +144,7 @@ { struct addrinfo *res, *t; struct addrinfo hints = { - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -162,6 +166,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) @@ -180,25 +186,30 @@ } gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(sockfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); goto out; } - if (read(sockfd, msg, sizeof msg) != sizeof msg) { + if (recv(sockfd, msg, sizeof(msg), MSG_WAITALL) != sizeof(msg)) { perror("client read"); fprintf(stderr, "Couldn't read remote 
address\n"); goto out; } - write(sockfd, "done", sizeof "done"); + if (write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + goto out; + } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; - sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); out: @@ -215,7 +226,7 @@ struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -237,6 +248,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; @@ -266,7 +279,7 @@ return NULL; } - n = read(connfd, msg, sizeof msg); + n = recv(connfd, msg, sizeof(msg), MSG_WAITALL); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); @@ -277,10 +290,12 @@ if (!rem_dest) goto out; - sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); - if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, sgid_idx)) { + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, + sgid_idx)) { fprintf(stderr, "Couldn't connect to remote QP\n"); free(rem_dest); rem_dest = NULL; @@ -289,7 +304,8 @@ gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(connfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); @@ -297,48 +313,70 @@ goto out; } - read(connfd, msg, 
sizeof msg); + /* expecting "done" msg */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } out: close(connfd); return rem_dest; } -#include - -static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, +static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, unsigned long long size, int rx_depth, int port, - int use_event, int is_server) + int use_event, int inlr_recv) { struct pingpong_context *ctx; + struct ibv_exp_device_attr dattr; + int ret; ctx = calloc(1, sizeof *ctx); if (!ctx) return NULL; + memset(&dattr, 0, sizeof(dattr)); + ctx->size = size; ctx->rx_depth = rx_depth; - ctx->buf = malloc(roundup(size, page_size)); - if (!ctx->buf) { - fprintf(stderr, "Couldn't allocate work buf.\n"); - return NULL; + if (!use_contiguous_mr) { + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } } - memset(ctx->buf, 0x7b + is_server, size); - ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); - return NULL; + goto clean_buffer; } + if (inlr_recv) { + dattr.comp_mask |= IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + ret = ibv_exp_query_device(ctx->context, &dattr); + if (ret) { + printf(" Couldn't query device for inline-receive capabilities.\n"); + } else if (!(dattr.comp_mask & IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ)) { + printf(" Inline-receive not supported by driver.\n"); + } else if (dattr.inline_recv_sz < inlr_recv) { + printf(" Max inline-receive(%d) < Requested inline-receive(%d).\n", + dattr.inline_recv_sz, inlr_recv); + } + } + ctx->inlr_recv = inlr_recv; + if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); - return NULL; + goto clean_device; } } else ctx->channel = NULL; @@ 
-346,24 +384,78 @@ ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); - return NULL; + goto clean_comp_channel; } - ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, IBV_ACCESS_LOCAL_WRITE); + if (!use_contiguous_mr && !use_odp) { + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, + IBV_ACCESS_LOCAL_WRITE); + } else if (use_odp) { + struct ibv_exp_reg_mr_in in; + in.pd = ctx->pd; + in.addr = ctx->buf; + in.length = size; + in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE | IBV_EXP_ACCESS_ON_DEMAND; + in.comp_mask = 0; + dattr.comp_mask |= IBV_EXP_DEVICE_ATTR_ODP; + ret = ibv_exp_query_device(ctx->context, &dattr); + if (ret) { + printf(" Couldn't query device for on-demand\ + paging capabilities.\n"); + goto clean_pd; + } else if (!(dattr.comp_mask & IBV_EXP_DEVICE_ATTR_ODP)) { + printf(" On-demand paging not supported by driver.\n"); + goto clean_pd; + } else if (!(dattr.odp_caps.per_transport_caps.rc_odp_caps & + IBV_EXP_ODP_SUPPORT_SEND)) { + printf(" Send is not supported for RC transport.\n"); + goto clean_pd; + } else if (!(dattr.odp_caps.per_transport_caps.rc_odp_caps & + IBV_EXP_ODP_SUPPORT_RECV)) { + printf(" Receive is not supported for RC transport.\n"); + goto clean_pd; + } + + ctx->mr = ibv_exp_reg_mr(&in); + } else { + struct ibv_exp_reg_mr_in in; + + in.pd = ctx->pd; + in.addr = contig_addr; + in.length = size; + in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE; + if (contig_addr) { + in.comp_mask = IBV_EXP_REG_MR_CREATE_FLAGS; + in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG; + } else { + in.comp_mask = 0; + in.exp_access |= IBV_EXP_ACCESS_ALLOCATE_MR; + } + + ctx->mr = ibv_exp_reg_mr(&in); + } + + if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); - return NULL; + goto clean_pd; } + + if (use_contiguous_mr) + ctx->buf = ctx->mr->addr; + + /* FIXME memset(ctx->buf, 0, size); */ + memset(ctx->buf, 0x7b, size); ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, 
"Couldn't create CQ\n"); - return NULL; + goto clean_mr; } { - struct ibv_qp_init_attr attr = { + struct ibv_exp_qp_init_attr attr = { .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { @@ -372,14 +464,23 @@ .max_send_sge = 1, .max_recv_sge = 1 }, - .qp_type = IBV_QPT_RC + .qp_type = IBV_QPT_RC, + .pd = ctx->pd, + .comp_mask = IBV_EXP_QP_INIT_ATTR_PD, + .max_inl_recv = ctx->inlr_recv }; + if (ctx->inlr_recv) + attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV; + + ctx->qp = ibv_exp_create_qp(ctx->context, &attr); - ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); - return NULL; + goto clean_cq; } + if (ctx->inlr_recv > attr.max_inl_recv) + printf(" Actual inline-receive(%d) < requested inline-receive(%d)\n", + attr.max_inl_recv, ctx->inlr_recv); } { @@ -396,11 +497,39 @@ IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); - return NULL; + goto clean_qp; } } return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + if (!use_contiguous_mr) + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; } int pp_close_ctx(struct pingpong_context *ctx) @@ -437,17 +566,22 @@ return 1; } - free(ctx->buf); + if (!use_contiguous_mr) + free(ctx->buf); + free(ctx); return 0; } +#define mmin(a, b) a < b ? 
a : b +#define MAX_SGE_LEN 0xFFFFFFF + static int pp_post_recv(struct pingpong_context *ctx, int n) { struct ibv_sge list = { .addr = (uintptr_t) ctx->buf, - .length = ctx->size, + .length = mmin(ctx->size, MAX_SGE_LEN), .lkey = ctx->mr->lkey }; struct ibv_recv_wr wr = { @@ -469,7 +603,7 @@ { struct ibv_sge list = { .addr = (uintptr_t) ctx->buf, - .length = ctx->size, + .length = mmin(ctx->size, MAX_SGE_LEN), .lkey = ctx->mr->lkey }; struct ibv_send_wr wr = { @@ -491,16 +625,59 @@ printf(" %s connect to server at \n", argv0); printf("\n"); printf("Options:\n"); - printf(" -p, --port= listen on/connect to port (default 18515)\n"); - printf(" -d, --ib-dev= use IB device (default first device found)\n"); - printf(" -i, --ib-port= use port of IB device (default 1)\n"); - printf(" -s, --size= size of message to exchange (default 4096)\n"); - printf(" -m, --mtu= path MTU (default 1024)\n"); - printf(" -r, --rx-depth= number of receives to post at a time (default 500)\n"); - printf(" -n, --iters= number of exchanges (default 1000)\n"); - printf(" -l, --sl= service level value\n"); - printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -p, --port= listen on/connect to port (default 18515)\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); + printf(" -s, --size= size of message to exchange (default 4096)\n"); + printf(" -m, --mtu= path MTU (default 1024)\n"); + printf(" -r, --rx-depth= number of receives to post at a time (default 500)\n"); + printf(" -n, --iters= number of exchanges (default 1000)\n"); + printf(" -l, --sl= service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -g, --gid-idx= local port gid index\n"); + printf(" -c, --contiguous-mr use contiguous mr\n"); + printf(" -t, --inline-recv= size of inline-recv\n"); + printf(" -a, --check-nop check NOP opcode\n"); + printf(" -o, --odp use on demand paging\n"); + 
printf(" -z, --contig_addr use specifix addr for contig pages MR, must use with -c flag\n");
+	printf(" -6, --ipv6 use IPv6\n");
+}
+
+/*
+ * Post one signaled NOP work request on ctx->qp and spin on ctx->cq until a
+ * completion is returned.
+ *
+ * Returns 0 when the completion status is IBV_WC_SUCCESS, the error code
+ * from ibv_exp_post_send() when posting fails, and -1 when polling fails or
+ * the completion carries an error status.
+ */
+int send_nop(struct pingpong_context *ctx)
+{
+	struct ibv_exp_send_wr nop_wr;
+	struct ibv_exp_send_wr *failed_wr;
+	struct ibv_exp_wc completion;
+	int polled;
+	int rc;
+
+	/* Start from an all-zero request and set only the NOP fields. */
+	memset(&nop_wr, 0, sizeof(nop_wr));
+	nop_wr.wr_id = PINGPONG_SEND_WRID;
+	nop_wr.num_sge = 0;
+	nop_wr.exp_opcode = IBV_EXP_WR_NOP;
+	nop_wr.exp_send_flags = IBV_EXP_SEND_SIGNALED;
+
+	rc = ibv_exp_post_send(ctx->qp, &nop_wr, &failed_wr);
+	if (rc) {
+		fprintf(stderr, "post nop failed\n");
+		return rc;
+	}
+
+	/* Busy-poll: 0 means "nothing yet", a negative value is a failure. */
+	for (;;) {
+		polled = ibv_exp_poll_cq(ctx->cq, 1, &completion,
+					 sizeof(completion));
+		if (polled < 0) {
+			fprintf(stderr, "poll CQ failed %d\n", polled);
+			return -1;
+		}
+		if (polled)
+			break;
+	}
+
+	if (completion.status != IBV_WC_SUCCESS) {
+		fprintf(stderr, "completion with error %d\n",
+			completion.status);
+		return -1;
+	}
+
+	return 0;
 } int main(int argc, char *argv[]) @@ -515,7 +692,7 @@ char *servername = NULL; int port = 18515; int ib_port = 1; - int size = 4096; + unsigned long long size = 4096; enum ibv_mtu mtu = IBV_MTU_1024; int rx_depth = 500; int iters = 1000; @@ -525,28 +702,39 @@ int num_cq_events = 0; int sl = 0; int gidx = -1; - char gid[33]; + char gid[INET6_ADDRSTRLEN]; + int inlr_recv = 0; + int check_nop = 0; + int err; srand48(getpid() * time(NULL)); + contig_addr = NULL; while (1) { int c; static struct option long_options[] = { - { .name = "port", .has_arg = 1, .val = 'p' }, - { .name = "ib-dev", .has_arg = 1, .val = 'd' }, - { .name = "ib-port", .has_arg = 1, .val = 'i' }, - { .name = "size", .has_arg = 1, .val = 's' }, - { .name = "mtu", .has_arg = 1, .val = 'm' }, - { .name = "rx-depth", .has_arg = 1, .val = 'r' }, - { .name = "iters", .has_arg = 1, .val = 'n' }, - { .name = "sl", .has_arg = 1, .val = 'l' }, - { .name = "events", .has_arg = 0, .val = 'e' }, - { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", 
.has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "contiguous-mr", .has_arg = 0, .val = 'c' }, + { .name = "inline-recv", .has_arg = 1, .val = 't' }, + { .name = "check-nop", .has_arg = 0, .val = 'a' }, + { .name = "odp", .has_arg = 0, .val = 'o' }, + { .name = "contig_addr", .has_arg = 1, .val = 'z' }, + { .name = "ipv6", .has_arg = 0, .val = '6' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:", long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:ecg:t:aoz:6", + long_options, NULL); if (c == -1) break; @@ -560,7 +748,7 @@ break; case 'd': - ib_devname = strdup(optarg); + ib_devname = strdupa(optarg); break; case 'i': @@ -572,7 +760,7 @@ break; case 's': - size = strtol(optarg, NULL, 0); + size = strtoll(optarg, NULL, 0); break; case 'm': @@ -603,6 +791,30 @@ gidx = strtol(optarg, NULL, 0); break; + case 'c': + ++use_contiguous_mr; + break; + + case 't': + inlr_recv = strtol(optarg, NULL, 0); + break; + + case 'a': + check_nop = 1; + break; + + case 'o': + use_odp = 1; + break; + + case 'z': + contig_addr = (void *)(uintptr_t)strtol(optarg, NULL, 0); + break; + + case '6': + family = AF_INET6; + break; + default: usage(argv[0]); return 1; @@ -610,12 +822,17 @@ } if (optind == argc - 1) - servername = strdup(argv[optind]); + servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; } + if (contig_addr && !use_contiguous_mr) { + usage(argv[0]); + return 1; + } + page_size = sysconf(_SC_PAGESIZE); dev_list = ibv_get_device_list(NULL); @@ -642,7 +859,7 @@ } } - ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event, !servername); + ctx = pp_init_ctx(ib_dev, size, rx_depth, 
ib_port, use_event, inlr_recv); if (!ctx) return 1; @@ -665,14 +882,15 @@ } my_dest.lid = ctx->portinfo.lid; - if (ctx->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND && !my_dest.lid) { + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && + !my_dest.lid) { fprintf(stderr, "Couldn't get local LID\n"); return 1; } if (gidx >= 0) { if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { - fprintf(stderr, "Could not get local gid for gid index %d\n", gidx); + fprintf(stderr, "can't read sgid of index %d\n", gidx); return 1; } } else @@ -688,7 +906,8 @@ if (servername) rem_dest = pp_client_exch_dest(servername, port, &my_dest); else - rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, &my_dest, gidx); + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + &my_dest, gidx); if (!rem_dest) return 1; @@ -698,12 +917,20 @@ rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); if (servername) - if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, gidx)) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, + gidx)) return 1; ctx->pending = PINGPONG_RECV_WRID; if (servername) { + if (check_nop) { + err = send_nop(ctx); + if (err) { + fprintf(stderr, "nop operation failed\n"); + return err; + } + } if (pp_post_send(ctx)) { fprintf(stderr, "Couldn't post send\n"); return 1; @@ -741,16 +968,15 @@ } { - struct ibv_wc wc[2]; + struct ibv_exp_wc wc[2]; int ne, i; do { - ne = ibv_poll_cq(ctx->cq, 2, wc); + ne = ibv_exp_poll_cq(ctx->cq, 2, wc, sizeof(wc[0])); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } - } while (!use_event && ne < 1); for (i = 0; i < ne; ++i) { Index: contrib/ofed/libibverbs/examples/shared_mr.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/shared_mr.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2012 Topspin Communications. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#if HAVE_CONFIG_H
+# include
+#endif /* HAVE_CONFIG_H */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DATA_BYTE_VALUE 0x7b
+
+/*
+ * Size of the MR-handle message sent over the TCP socket: eight hex digits
+ * (enough for any uint32_t handle) plus the terminating NUL.
+ */
+#define MR_HANDLE_MSG_SIZE (sizeof "00000000")
+
+static int page_size;
+static char *servername;
+static int no_rdma;
+
+/* Per-process state: verbs objects, the MR buffer and the control socket. */
+struct shared_mr_context {
+	struct ibv_context *context;
+	struct ibv_pd *pd;
+	struct ibv_mr *mr;
+	void *buf;
+	size_t size;
+	int sockfd;	/* 0 (from calloc) means "no socket open" */
+};
+
+/* Information the server hands to the client over the socket. */
+struct shared_mr_info {
+	uint32_t mr_handle;
+};
+
+/* Round val up to the next multiple of align (align must be a power of 2). */
+static inline unsigned long align(unsigned long val, unsigned long align)
+{
+	return (val + align - 1) & ~(align - 1);
+}
+
+/*
+ * Client side of the handshake: connect to servername:port and read the
+ * shared MR handle published by the server.
+ *
+ * On success the connected socket is stored in ctx->sockfd and a malloc'ed
+ * shared_mr_info is returned (caller frees).  On any failure the socket is
+ * closed and NULL is returned.
+ */
+static struct shared_mr_info *shared_mr_client_exch_info(
+					struct shared_mr_context *ctx,
+					int port)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[MR_HANDLE_MSG_SIZE];
+	unsigned int handle;
+	int n;
+	int sockfd = -1;
+	struct shared_mr_info *rem_shared_mr_info = NULL;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return NULL;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername,
+			port);
+		free(service);
+		return NULL;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't connect to %s:%d\n",
+			servername, port);
+		return NULL;
+	}
+
+	if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read shared mr ID\n");
+		goto out;
+	}
+
+	/* Do not rely on the peer having NUL-terminated the message. */
+	msg[sizeof msg - 1] = '\0';
+	if (sscanf(msg, "%x", &handle) != 1) {
+		fprintf(stderr, "Couldn't parse shared mr ID\n");
+		goto out;
+	}
+
+	rem_shared_mr_info = malloc(sizeof *rem_shared_mr_info);
+	if (!rem_shared_mr_info)
+		goto out;
+
+	rem_shared_mr_info->mr_handle = handle;
+	ctx->sockfd = sockfd;
+	return rem_shared_mr_info;
+
+out:
+	close(sockfd);
+	return NULL;
+}
+
+/*
+ * Server side of the handshake: listen on port, accept one connection and
+ * send the MR handle from shared_mr_info as a fixed-size hex string.
+ *
+ * Returns 0 on success (the accepted socket is kept in ctx->sockfd) and 1 on
+ * failure.
+ */
+static int shared_mr_server_exch_info(struct shared_mr_context *ctx,
+				      int port,
+				      const struct shared_mr_info *shared_mr_info)
+
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags = AI_PASSIVE,
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[MR_HANDLE_MSG_SIZE];
+	int n;
+	int sockfd = -1, connfd;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return 1;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		free(service);
+		return 1;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR,
+				   &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return 1;
+	}
+
+	listen(sockfd, 1);
+	connfd = accept(sockfd, NULL, 0);
+	close(sockfd);
+	if (connfd < 0) {
+		fprintf(stderr, "accept() failed\n");
+		return 1;
+	}
+
+	/* Bounded and fixed-width, so every 32-bit handle fits in msg. */
+	snprintf(msg, sizeof msg, "%08x",
+		 (unsigned int)shared_mr_info->mr_handle);
+	if (write(connfd, msg, sizeof msg) != sizeof msg) {
+		fprintf(stderr, "Couldn't send shared mr ID\n");
+		goto out;
+	}
+
+	ctx->sockfd = connfd;
+	return 0;
+
+
+out:
+	close(connfd);
+	return 1;
+}
+
+/*
+ * Fill the whole buffer with DATA_BYTE_VALUE, then send the buffer length to
+ * the server over the socket.  Returns 0 on success, 1 on a short write.
+ */
+static int shared_mr_client_write(struct shared_mr_context *ctx)
+{
+
+	memset(ctx->buf, DATA_BYTE_VALUE, ctx->size);
+	if (write(ctx->sockfd, (char *)(&(ctx->size)), sizeof(ctx->size)) !=
+	    sizeof(ctx->size))
+		return 1;
+	fprintf(stderr, "shared_mr_client_write has succeeded\n");
+	return 0;
+}
+
+
+/*
+ * Read the length sent by the client, check that it equals ctx->size and
+ * that every byte of ctx->buf holds DATA_BYTE_VALUE.
+ * Returns 0 when everything matches, 1 otherwise.
+ */
+static int shared_mr_server_read(struct shared_mr_context *ctx)
+{
+	char msg[sizeof(ctx->size)];
+	size_t len;
+	size_t i;
+	char *data_buf;
+	if (read(ctx->sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "Couldn't read data len\n");
+		return 1;
+	}
+	memcpy(&len, msg, sizeof len);
+	if (ctx->size != len) {
+		fprintf(stderr, "read data len missmatch (%zu,%zu)\n",
+			len, ctx->size);
+		return 1;
+	}
+
+	data_buf = ctx->buf;
+	for (i = 0; i < len; i++) {
+		if (data_buf[i] != DATA_BYTE_VALUE) {
+			fprintf(stderr, "data mismatch, offset=%zu\n", i);
+			return 1;
+		}
+	}
+	fprintf(stderr, "server - data match\n");
+	return 0;
+}
+
+/*
+ * Client: register against the MR identified by shared_mr_info->mr_handle
+ * and adopt its address and length as ctx->buf / ctx->size.
+ * IBV_EXP_ACCESS_NO_RDMA is added to the access flags when the global
+ * no_rdma is set.  Returns 0 on success, 1 on failure.
+ */
+static int register_shared_mr(struct shared_mr_context *ctx,
+			      struct shared_mr_info *shared_mr_info)
+{
+	struct ibv_mr *shared_mr;
+	uint64_t access = IBV_EXP_ACCESS_LOCAL_WRITE;
+	struct ibv_exp_reg_shared_mr_in shared_mr_in;
+
+	memset(&shared_mr_in, 0, sizeof(shared_mr_in));
+	shared_mr_in.mr_handle = shared_mr_info->mr_handle;
+	shared_mr_in.pd = ctx->pd;
+	/* shared_mr_in.addr is NULL as part of memset */
+
+	if (no_rdma)
+		access |= IBV_EXP_ACCESS_NO_RDMA;
+
+	shared_mr_in.exp_access = access;
+	shared_mr = ibv_exp_reg_shared_mr(&shared_mr_in);
+	if (!shared_mr) {
+		fprintf(stderr, "Failed via reg shared mr errno=%d\n", errno);
+		return 1;
+	}
+
+	fprintf(stderr, "client registered successfully to shared mr %s\n",
+		no_rdma ? "(non rdma)" : "");
+
+	ctx->mr = shared_mr;
+	ctx->buf = shared_mr->addr;
+	ctx->size = shared_mr->length;
+	return 0;
+}
+
+/*
+ * Server: register an MR with the shared-MR user read/write access flags and
+ * record its handle in shared_mr_info.  When ctx->buf is non-NULL it is
+ * passed as the requested address with the CONTIG create flag; otherwise
+ * IBV_EXP_ACCESS_ALLOCATE_MR is requested.  ctx->buf is updated from the
+ * resulting MR.  Returns 0 on success, 1 on failure.
+ */
+static int create_shared_mr(struct shared_mr_context *ctx,
+			    struct shared_mr_info *shared_mr_info)
+{
+
+	struct ibv_exp_reg_mr_in in;
+
+	/* Zero every field, as register_shared_mr() does for its input. */
+	memset(&in, 0, sizeof(in));
+	in.pd = ctx->pd;
+	in.addr = ctx->buf;
+	in.length = ctx->size;
+	in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE |
+			IBV_EXP_ACCESS_SHARED_MR_USER_WRITE |
+			IBV_EXP_ACCESS_SHARED_MR_USER_READ;
+	if (ctx->buf) {
+		in.comp_mask = IBV_EXP_REG_MR_CREATE_FLAGS;
+		in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG;
+	} else {
+		in.exp_access |= IBV_EXP_ACCESS_ALLOCATE_MR;
+		in.comp_mask = 0;
+	}
+
+	ctx->mr = ibv_exp_reg_mr(&in);
+
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't register MR\n");
+		return 1;
+	}
+
+	ctx->buf = ctx->mr->addr;
+	shared_mr_info->mr_handle = ctx->mr->handle;
+	return 0;
+}
+
+/*
+ * Allocate a zeroed context, open ib_dev and allocate a protection domain.
+ * size and contig_addr are only recorded here (in ctx->size / ctx->buf).
+ * Returns the context, or NULL after releasing whatever was acquired.
+ */
+static struct shared_mr_context *shared_mr_init_ctx(struct ibv_device *ib_dev,
+						    size_t size, void *contig_addr)
+{
+	struct shared_mr_context *ctx;
+
+	ctx = calloc(1, sizeof *ctx);
+	if (!ctx)
+		return NULL;
+
+	ctx->size = size;
+	ctx->buf = contig_addr;
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto clean_ctx;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto clean_device;
+	}
+
+	return ctx;
+
+clean_device:
+	ibv_close_device(ctx->context);
+
+clean_ctx:
+	free(ctx);
+	return NULL;
+}
+
+/*
+ * Release the MR, PD, device context and socket held by ctx, then free ctx.
+ * Returns 0 on success.  Returns 1 as soon as a verbs teardown call fails;
+ * in that case the remaining resources and ctx itself are not released.
+ */
+int shared_mr_close_ctx(struct shared_mr_context *ctx)
+{
+
+	if (ctx->mr) {
+		if (ibv_dereg_mr(ctx->mr)) {
+			fprintf(stderr, "Couldn't deregister MR\n");
+			return 1;
+		}
+	}
+
+	if (ctx->pd) {
+		if (ibv_dealloc_pd(ctx->pd)) {
+			fprintf(stderr, "Couldn't deallocate PD\n");
+			return 1;
+		}
+	}
+
+
+	if (ibv_close_device(ctx->context)) {
+		fprintf(stderr, "Couldn't release context\n");
+		return 1;
+	}
+
+	if (ctx->sockfd)
+		close(ctx->sockfd);
+
+	free(ctx);
+	return 0;
+}
+
+
+/* Print the command-line help text. */
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf(" %s start a server and wait for connection\n",
+	       argv0);
+	printf(" %s connect to server at \n",
+	       argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf(" -p, --port= listen on/connect to port (default 18515)\n");
+	printf(" -d, --ib-dev= use IB device (default first device found)\n");
+	printf(" -s, --size= size of message to exchange (default 4096)\n");
+	printf(" -n, --no-rdma no rdma on shared mr\n");
+	printf(" -a, --contig_addr ask for specific contiguous addr\n");
+
+}
+
+/*
+ * Without a host argument: act as server (create the shared MR, publish its
+ * handle, then verify the data).  With a host argument: act as client (fetch
+ * the handle, register against the shared MR, fill it).
+ * Exit status is 0 on success and 1 on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	struct shared_mr_context *ctx;
+	struct shared_mr_info shared_mr_info;
+	struct shared_mr_info *rem_shared_mr_info = NULL;
+	char *ib_devname = NULL;
+	int port = 18515;
+	size_t size = 4096;
+	int rc = 0;
+	void *contig_addr = NULL;
+
+
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "no-rdma", .has_arg = 0, .val = 'n' },
+			{ .name = "contig_addr", .has_arg = 1, .val = 'a' },
+			{ 0 }
+		};
+
+		c = getopt_long(argc, argv, "p:d:s:na:", long_options, NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port < 0 || port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdup(optarg);
+			break;
+
+		case 's':
+			size = strtoull(optarg, NULL, 0);
+			break;
+
+		case 'n':
+			++no_rdma;
+			break;
+
+		case 'a':
+			contig_addr = (void *)(uintptr_t)strtoull(optarg, NULL, 0);
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (optind == argc - 1)
+		servername = strdup(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	page_size = sysconf(_SC_PAGESIZE);
+	size = align(size, page_size);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		perror("Failed to get IB devices list");
+		rc = 1;
+		goto free_args;
+	}
+
+	if (!ib_devname) {
+		ib_dev = *dev_list;
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			rc = 1;
+			goto free_dev_list;
+		}
+	} else {
+		int i;
+		for (i = 0; dev_list[i]; ++i)
+			if (!strcmp(ibv_get_device_name(dev_list[i]),
+				    ib_devname))
+				break;
+		ib_dev = dev_list[i];
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			rc = 1;
+			goto free_dev_list;
+		}
+	}
+
+	ctx = shared_mr_init_ctx(ib_dev, size, contig_addr);
+	if (!ctx) {
+		rc = 1;
+		goto free_dev_list;
+	}
+
+	if (servername) {
+		rem_shared_mr_info = shared_mr_client_exch_info(ctx, port);
+		if (!rem_shared_mr_info) {
+			rc = 1;
+			goto cleanup;
+		}
+
+		if (register_shared_mr(ctx, rem_shared_mr_info)) {
+			rc = 1;
+			goto cleanup;
+		}
+
+		if (shared_mr_client_write(ctx)) {
+			rc = 1;
+			goto cleanup;
+
+		}
+	} else {
+		if (create_shared_mr(ctx, &shared_mr_info)) {
+			rc = 1;
+			goto cleanup;
+		}
+		if (shared_mr_server_exch_info(ctx, port, &shared_mr_info)) {
+			rc = 1;
+			goto cleanup;
+		}
+		if (shared_mr_server_read(ctx)) {
+			rc = 1;
+			goto cleanup;
+
+		}
+	}
+
+
+cleanup:
+
+	/* A teardown failure is reported but must not skip the frees below. */
+	if (shared_mr_close_ctx(ctx))
+		rc = 1;
+
+free_dev_list:
+	ibv_free_device_list(dev_list);
+
+free_args:
+	free(rem_shared_mr_info);
+	free(ib_devname);
+	free(servername);
+
+	return rc;
+}
Index: contrib/ofed/libibverbs/examples/srq_pingpong.c =================================================================== --- contrib/ofed/libibverbs/examples/srq_pingpong.c +++ contrib/ofed/libibverbs/examples/srq_pingpong.c @@ -56,6 +56,7 @@ }; static int page_size; +static int family = AF_INET; struct pingpong_context { struct ibv_context *context; @@ -147,7 +148,7 @@ { struct addrinfo *res, *t; struct addrinfo hints = { - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -171,6 +172,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, 
t->ai_addr, t->ai_addrlen)) @@ -190,7 +193,8 @@ for (i = 0; i < MAX_QP; ++i) { gid_to_wire_gid(&my_dest[i].gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, + my_dest[i].qpn, my_dest[i].psn, gid); if (write(sockfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); goto out; @@ -214,12 +218,17 @@ n += r; } - sscanf(msg, "%x:%x:%x:%s", - &rem_dest[i].lid, &rem_dest[i].qpn, &rem_dest[i].psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest[i].lid, &rem_dest[i].qpn, + &rem_dest[i].psn, gid); wire_gid_to_gid(gid, &rem_dest[i].gid); } - write(sockfd, "done", sizeof "done"); + if (write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } out: close(sockfd); @@ -235,7 +244,7 @@ struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -259,6 +268,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; @@ -305,12 +316,13 @@ n += r; } - sscanf(msg, "%x:%x:%x:%s", - &rem_dest[i].lid, &rem_dest[i].qpn, &rem_dest[i].psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest[i].lid, &rem_dest[i].qpn, + &rem_dest[i].psn, gid); wire_gid_to_gid(gid, &rem_dest[i].gid); } - if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, sgid_idx)) { + if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, + sgid_idx)) { fprintf(stderr, "Couldn't connect to remote QP\n"); free(rem_dest); rem_dest = NULL; @@ -319,7 +331,8 @@ for (i = 0; i < MAX_QP; ++i) { gid_to_wire_gid(&my_dest[i].gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, 
+ my_dest[i].qpn, my_dest[i].psn, gid); if (write(connfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); @@ -328,7 +341,13 @@ } } - read(connfd, msg, sizeof msg); + /* expecting msg "done" */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } out: close(connfd); @@ -350,10 +369,10 @@ ctx->num_qp = num_qp; ctx->rx_depth = rx_depth; - ctx->buf = malloc(roundup(size, page_size)); + ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); - return NULL; + goto clean_ctx; } memset(ctx->buf, 0, size); @@ -362,14 +381,14 @@ if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); - return NULL; + goto clean_buffer; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); - return NULL; + goto clean_device; } } else ctx->channel = NULL; @@ -377,20 +396,20 @@ ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); - return NULL; + goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); - return NULL; + goto clean_pd; } ctx->cq = ibv_create_cq(ctx->context, rx_depth + num_qp, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); - return NULL; + goto clean_mr; } { @@ -404,7 +423,7 @@ ctx->srq = ibv_create_srq(ctx->pd, &attr); if (!ctx->srq) { fprintf(stderr, "Couldn't create SRQ\n"); - return NULL; + goto clean_cq; } } @@ -423,7 +442,7 @@ ctx->qp[i] = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp[i]) { fprintf(stderr, "Couldn't create QP[%d]\n", i); - return NULL; + goto clean_qps; } } @@ -441,11 +460,44 @@ IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify 
QP[%d] to INIT\n", i); - return NULL; + goto clean_qps_full; } } return ctx; + +clean_qps_full: + i = num_qp; + +clean_qps: + for (--i; i >= 0; --i) + ibv_destroy_qp(ctx->qp[i]); + + ibv_destroy_srq(ctx->srq); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; } int pp_close_ctx(struct pingpong_context *ctx, int num_qp) @@ -567,6 +619,7 @@ printf(" -l, --sl= service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -g, --gid-idx= local port gid index\n"); + printf(" -6, --ipv6 use IPv6\n"); } int main(int argc, char *argv[]) @@ -614,10 +667,12 @@ { .name = "sl", .has_arg = 1, .val = 'l' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "ipv6", .has_arg = 0, .val = '6' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:m:q:r:n:l:eg:", long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:m:q:r:n:l:eg:6", + long_options, NULL); if (c == -1) break; @@ -631,7 +686,7 @@ break; case 'd': - ib_devname = strdup(optarg); + ib_devname = strdupa(optarg); break; case 'i': @@ -678,6 +733,10 @@ gidx = strtol(optarg, NULL, 0); break; + case '6': + family = AF_INET6; + break; + default: usage(argv[0]); return 1; @@ -685,7 +744,7 @@ } if (optind == argc - 1) - servername = strdup(argv[optind]); + servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; @@ -753,28 +812,33 @@ my_dest[i].qpn = ctx->qp[i]->qp_num; my_dest[i].psn = lrand48() & 0xffffff; my_dest[i].lid = ctx->portinfo.lid; - if (ctx->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND && !my_dest[i].lid) { + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET + && !my_dest[i].lid) { fprintf(stderr, "Couldn't 
get local LID\n"); return 1; } if (gidx >= 0) { - if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest[i].gid)) { - fprintf(stderr, "Could not get local gid for gid index %d\n", gidx); + if (ibv_query_gid(ctx->context, ib_port, gidx, + &my_dest[i].gid)) { + fprintf(stderr, "Could not get local gid for " + "gid index %d\n", gidx); return 1; } } else memset(&my_dest[i].gid, 0, sizeof my_dest[i].gid); inet_ntop(AF_INET6, &my_dest[i].gid, gid, sizeof gid); - printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", - my_dest[i].lid, my_dest[i].qpn, my_dest[i].psn, gid); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, " + "GID %s\n", my_dest[i].lid, my_dest[i].qpn, + my_dest[i].psn, gid); } if (servername) rem_dest = pp_client_exch_dest(servername, port, my_dest); else - rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, my_dest, gidx); + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + my_dest, gidx); if (!rem_dest) return 1; @@ -783,12 +847,14 @@ for (i = 0; i < num_qp; ++i) { inet_ntop(AF_INET6, &rem_dest[i].gid, gid, sizeof gid); - printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", - rem_dest[i].lid, rem_dest[i].qpn, rem_dest[i].psn, gid); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, " + "GID %s\n", rem_dest[i].lid, rem_dest[i].qpn, + rem_dest[i].psn, gid); } if (servername) - if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, gidx)) + if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, + gidx)) return 1; if (servername) Index: contrib/ofed/libibverbs/examples/task_pingpong.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/task_pingpong.c @@ -0,0 +1,1202 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2009-2010 Mellanox Technologies. All rights reserved. 
+ */
+
+#if HAVE_CONFIG_H
+ #include
+#endif /* HAVE_CONFIG_H */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cc_pingpong.h"
+
+
+/* Work-request IDs used to tell completions apart. */
+enum {
+	PP_RECV_WRID = 1,
+	PP_SEND_WRID = 2,
+	PP_CQE_WAIT = 3,
+};
+
+/* Printable names, indexed by the work-request IDs above. */
+char *wr_id_str[] = {
+	[PP_RECV_WRID] = "RECV",
+	[PP_SEND_WRID] = "SEND",
+	[PP_CQE_WAIT] = "CQE_WAIT",
+};
+
+static long page_size;
+
+/* Calc-operation parameters plus the gather list built from the operands. */
+struct pingpong_calc_ctx {
+	enum ibv_exp_calc_op opcode;
+	enum ibv_exp_calc_data_type data_type;
+	enum ibv_exp_calc_data_size data_size;
+	void *gather_buff;
+	int gather_list_size;
+	struct ibv_sge *gather_list;
+};
+
+struct pingpong_context {
+	struct ibv_context *context;
+	struct ibv_pd *pd;
+	struct ibv_mr *mr;
+	struct ibv_cq *scq;
+	struct ibv_cq *rcq;
+	struct ibv_qp *qp;
+
+	struct ibv_qp *mqp;
+	struct ibv_cq *mcq;
+
+	void *buf;
+	int size;
+	int rx_depth;
+	int pending;
+
+	int scnt;
+	int rcnt;
+
+	struct pingpong_calc_ctx calc_op;
+};
+
+/* Addressing triple exchanged between the two sides over TCP. */
+struct pingpong_dest {
+	int lid;
+	int qpn;
+	int psn;
+};
+
+
+/*
+ * Move qp to RTR and then to RTS, using dest as the remote endpoint and
+ * my_psn as the local send PSN.  Returns 0 on success, 1 if either
+ * ibv_modify_qp() call fails.  ctx is not used by this function.
+ */
+static int pp_connect_ctx(struct pingpong_context *ctx,
+			  struct ibv_qp *qp,
+			  int port,
+			  int my_psn,
+			  enum ibv_mtu mtu,
+			  int sl,
+			  struct pingpong_dest *dest)
+{
+	struct ibv_qp_attr attr = {
+		.qp_state = IBV_QPS_RTR,
+		.path_mtu = mtu,
+		.dest_qp_num = dest->qpn,
+		.rq_psn = dest->psn,
+		.max_dest_rd_atomic = 1,
+		.min_rnr_timer = 12,
+		.ah_attr = {
+			.is_global = 0,
+			.dlid = dest->lid,
+			.sl = sl,
+			.src_path_bits = 0,
+			.port_num = port
+		}
+	};
+
+	if (ibv_modify_qp(qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_AV |
+			  IBV_QP_PATH_MTU |
+			  IBV_QP_DEST_QPN |
+			  IBV_QP_RQ_PSN |
+			  IBV_QP_MAX_DEST_RD_ATOMIC |
+			  IBV_QP_MIN_RNR_TIMER)) {
+		fprintf(stderr, "Failed to modify QP to RTR\n");
+		return 1;
+	}
+
+	attr.qp_state = IBV_QPS_RTS;
+	attr.timeout = 14;
+	attr.retry_cnt = 7;
+	attr.rnr_retry = 7;
+	attr.sq_psn = my_psn;
+	attr.max_rd_atomic = 1;
+	if (ibv_modify_qp(qp, &attr,
+			  IBV_QP_STATE |
+			  IBV_QP_TIMEOUT |
+			  IBV_QP_RETRY_CNT |
+			  IBV_QP_RNR_RETRY |
+			  IBV_QP_SQ_PSN |
+			  IBV_QP_MAX_QP_RD_ATOMIC)) {
+		fprintf(stderr, "Failed to modify QP to RTS\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Client handshake: connect to servername:port, send my_dest as
+ * "lid:qpn:psn" in hex, read the server's triple back and acknowledge with
+ * "done".  Returns a malloc'ed pingpong_dest (caller frees) or NULL on any
+ * failure.  The socket is always closed before returning.
+ */
+static struct pingpong_dest *pp_client_exch_dest(const char *servername,
+						 int port,
+						 const struct pingpong_dest *my_dest)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[sizeof "0000:000000:000000"];
+	int n;
+	int sockfd = -1;
+	struct pingpong_dest *rem_dest = NULL;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return NULL;
+
+	n = getaddrinfo(servername, service, &hints, &res);
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
+		free(service);
+		return NULL;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
+		return NULL;
+	}
+
+	sprintf(msg, "%04x:%06x:%06x", my_dest->lid, my_dest->qpn, my_dest->psn);
+	if (write(sockfd, msg, sizeof msg) != sizeof msg) {
+		fprintf(stderr, "Couldn't send local address\n");
+		goto out;
+	}
+
+	if (read(sockfd, msg, sizeof msg) != sizeof msg) {
+		perror("client read");
+		fprintf(stderr, "Couldn't read remote address\n");
+		goto out;
+	}
+	/* Do not rely on the peer having NUL-terminated the message. */
+	msg[sizeof msg - 1] = '\0';
+
+	if (write(sockfd, "done", sizeof "done") != sizeof("done")) {
+		fprintf(stderr, "Couldn't send \"done\" msg\n");
+		goto out;
+	}
+
+	rem_dest = malloc(sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (sscanf(msg, "%x:%x:%x", &rem_dest->lid, &rem_dest->qpn,
+		   &rem_dest->psn) != 3) {
+		fprintf(stderr, "Couldn't parse remote address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+	}
+
+out:
+	close(sockfd);
+	return rem_dest;
+}
+
+/*
+ * Server handshake: accept one connection on port, read the client's
+ * "lid:qpn:psn" triple, connect ctx->qp to it, send my_dest back and wait
+ * for the client's final message.  Returns a malloc'ed pingpong_dest
+ * (caller frees) or NULL on any failure.
+ */
+static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx,
+						 int ib_port,
+						 enum ibv_mtu mtu,
+						 int port,
+						 int sl,
+						 const struct pingpong_dest *my_dest)
+{
+	struct addrinfo *res, *t;
+	struct addrinfo hints = {
+		.ai_flags = AI_PASSIVE,
+		.ai_family = AF_UNSPEC,
+		.ai_socktype = SOCK_STREAM
+	};
+	char *service;
+	char msg[sizeof "0000:000000:000000"];
+	int n;
+	int sockfd = -1, connfd;
+	struct pingpong_dest *rem_dest = NULL;
+
+	if (asprintf(&service, "%d", port) < 0)
+		return NULL;
+
+	n = getaddrinfo(NULL, service, &hints, &res);
+
+	/* getaddrinfo() reports failure with any non-zero value. */
+	if (n != 0) {
+		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
+		free(service);
+		return NULL;
+	}
+
+	for (t = res; t; t = t->ai_next) {
+		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
+		if (sockfd >= 0) {
+			n = 1;
+
+			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
+
+			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
+				break;
+
+			close(sockfd);
+			sockfd = -1;
+		}
+	}
+
+	freeaddrinfo(res);
+	free(service);
+
+	if (sockfd < 0) {
+		fprintf(stderr, "Couldn't listen to port %d\n", port);
+		return NULL;
+	}
+
+	listen(sockfd, 1);
+	connfd = accept(sockfd, NULL, 0);
+	close(sockfd);
+
+	if (connfd < 0) {
+		fprintf(stderr, "accept() failed\n");
+		return NULL;
+	}
+
+	n = read(connfd, msg, sizeof msg);
+	if (n != sizeof msg) {
+		perror("server read");
+		fprintf(stderr, "%d/%d: Couldn't read remote address\n",
+			n, (int) sizeof msg);
+		goto out;
+	}
+	/* Do not rely on the peer having NUL-terminated the message. */
+	msg[sizeof msg - 1] = '\0';
+
+	rem_dest = malloc(sizeof *rem_dest);
+	if (!rem_dest)
+		goto out;
+
+	if (sscanf(msg, "%x:%x:%x", &rem_dest->lid, &rem_dest->qpn,
+		   &rem_dest->psn) != 3) {
+		fprintf(stderr, "Couldn't parse remote address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+	if (pp_connect_ctx(ctx, ctx->qp, ib_port, my_dest->psn, mtu,
+			   sl, rem_dest)) {
+		fprintf(stderr, "Couldn't connect to remote QP\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+	sprintf(msg, "%04x:%06x:%06x", my_dest->lid, my_dest->qpn,
+		my_dest->psn);
+	if (write(connfd, msg, sizeof msg) != sizeof msg) {
+		fprintf(stderr, "Couldn't send local address\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+	/* expecting msg "done" */
+	if (read(connfd, msg, sizeof(msg)) <= 0) {
+		fprintf(stderr, "Couldn't read \"done\" msg\n");
+		free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+out:
+	close(connfd);
+	return rem_dest;
+}
+
+
+
+/*
+ * Parse the comma-separated operand list in ops_str into buff and build the
+ * matching gather list.
+ *
+ * ops_str is modified in place by strtok().  Only PP_DATA_TYPE_INT64 is
+ * handled; any other data type fails.  Operand i is stored as an int64_t at
+ * index 2*i of buff, and one gather entry is created per op_per_gather
+ * operands, each covering (sz + 8) * op_per_gather bytes.
+ *
+ * On success the operand count (one more than the number of commas) is
+ * returned and calc_ctx->gather_buff / gather_list are set; the caller owns
+ * the calloc'ed gather list.  On failure -1 is returned and the list is
+ * freed.  calc_op and max_num_operands are not used here.
+ */
+int __parse_calc_to_gather(char *ops_str,
+			   enum pp_wr_calc_op calc_op,
+			   enum pp_wr_data_type data_type,
+			   int op_per_gather,
+			   int max_num_operands, uint32_t lkey,
+			   struct pingpong_calc_ctx *calc_ctx, void *buff)
+{
+
+	int i, gather_ix, num_operands;
+	int sz;
+	const char *p;
+	char *token, *end = NULL;
+	struct ibv_sge *gather_list = NULL;
+
+	if (!ops_str) {
+		fprintf(stderr, "You must choose an operation to perform.\n");
+		return -1;
+	}
+
+	/* op_per_gather is a divisor below; reject values that cannot work. */
+	if (op_per_gather <= 0)
+		return -1;
+
+	sz = pp_data_type_to_size(data_type);
+
+	/* One operand more than there are separators. */
+	num_operands = 1;
+	for (p = ops_str; *p; p++) {
+		if (*p == ',')
+			num_operands++;
+	}
+
+	calc_ctx->gather_list_size = num_operands;
+
+	token = strtok(ops_str, ",");
+	if (!token)
+		return -1;
+
+	/* ceil(num_operands / op_per_gather) entries */
+	gather_list = calloc(num_operands/op_per_gather + (num_operands%op_per_gather ? 1 : 0),
+			     sizeof *gather_list);
+	if (!gather_list)
+		return -1;
+
+	for (i = 0, gather_ix = 0; i < num_operands; i++) {
+		if (!(i % op_per_gather)) {
+			gather_list[gather_ix].addr = (uint64_t)(uintptr_t)buff +
+						      (sz+8)*i;
+			gather_list[gather_ix].length = (sz+8)*op_per_gather;
+			gather_list[gather_ix].lkey = lkey;
+
+			gather_ix++;
+		}
+
+		/* Only 64-bit integer operands are implemented. */
+		if (data_type != PP_DATA_TYPE_INT64)
+			goto gather_out;
+
+		*((int64_t *)buff + i*2) = strtoll(token, &end, 0);
+		if (end == token) {
+			fprintf(stderr, "Invalid operand '%s'\n", token);
+			goto gather_out;
+		}
+
+		token = strtok(NULL, ",");
+		if (!token)
+			break;
+
+	}
+
+	calc_ctx->gather_buff = buff;
+	calc_ctx->gather_list = gather_list;
+
+	return num_operands;
+
+gather_out:
+	free(gather_list);
+
+	return -1;
+}
+
+
+static struct pingpong_context *pp_init_ctx(struct 
ibv_device *ib_dev, + int size, int rx_depth, + int port, + enum pp_wr_calc_op calc_op, + enum pp_wr_data_type calc_data_type, + char *calc_operands_str) +{ + struct pingpong_context *ctx; + int rc; + + ctx = malloc(sizeof *ctx); + if (!ctx) + return NULL; + memset(ctx, 0, sizeof *ctx); + + ctx->size = size; + ctx->rx_depth = rx_depth; + + ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; + ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; + ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; + + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } + + memset(ctx->buf, 0, size); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_device; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, IBV_ACCESS_LOCAL_WRITE); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + if (calc_op != PP_CALC_INVALID) { + int op_per_gather, max_num_op; + + ctx->calc_op.opcode = IBV_EXP_CALC_OP_ADD; + ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_INT; + ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_64_BIT; + + rc = pp_query_calc_cap(ctx->context, + ctx->calc_op.opcode, + ctx->calc_op.data_type, + ctx->calc_op.data_size, + &op_per_gather, &max_num_op); + if (rc) { + fprintf(stderr, "-E- operation not supported on %s. 
valid ops are:\n", + ibv_get_device_name(ib_dev)); + + pp_print_dev_calc_ops(ctx->context); + goto clean_mr; + } + + if (__parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, + op_per_gather, max_num_op, ctx->mr->lkey, + &ctx->calc_op, ctx->buf) < 0) + goto clean_mr; + } + + { + struct ibv_exp_cq_attr attr = { + .comp_mask = IBV_EXP_CQ_ATTR_CQ_CAP_FLAGS, + .cq_cap_flags = IBV_EXP_CQ_IGNORE_OVERRUN + }; + + ctx->rcq = ibv_create_cq(ctx->context, rx_depth, NULL, NULL, 0); + if (!ctx->rcq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + if (ibv_exp_modify_cq(ctx->rcq, &attr, IBV_EXP_CQ_CAP_FLAGS)) { + fprintf(stderr, "Failed to modify CQ\n"); + goto clean_rcq; + } + } + + { + struct ibv_exp_cq_attr attr = { + .comp_mask = IBV_EXP_CQ_ATTR_CQ_CAP_FLAGS, + .cq_cap_flags = IBV_EXP_CQ_IGNORE_OVERRUN + }; + + ctx->scq = ibv_create_cq(ctx->context, 0x10, NULL, NULL, 0); + if (!ctx->scq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_rcq; + } + + if (ibv_exp_modify_cq(ctx->scq, &attr, IBV_EXP_CQ_CAP_FLAGS)) { + fprintf(stderr, "Failed to modify CQ\n"); + goto clean_scq; + } + } + + { + struct ibv_exp_qp_init_attr attr = { + .send_cq = ctx->scq, + .recv_cq = ctx->rcq, + .cap = { + .max_send_wr = 16, + .max_recv_wr = rx_depth, + .max_send_sge = 16, + .max_recv_sge = 16 + }, + .qp_type = IBV_QPT_RC, + .pd = ctx->pd + }; + + { + attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; + attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL | IBV_EXP_QP_CREATE_MANAGED_SEND; + ctx->qp = ibv_exp_create_qp(ctx->context, &attr); + } + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_scq; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + 
goto clean_qp; + } + } + + + /* Create MQ */ + ctx->mcq = ibv_create_cq(ctx->context, 0x40, NULL, NULL, 0); + if (!ctx->mcq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_qp; + } + + { + struct ibv_exp_qp_init_attr attr = { + .send_cq = ctx->mcq, + .recv_cq = ctx->mcq, + .cap = { + .max_send_wr = 0x40, + .max_recv_wr = 0, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_RC, + .pd = ctx->pd + }; + + { + attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; + attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; + ctx->mqp = ibv_exp_create_qp(ctx->context, &attr); + } + if (!ctx->mqp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_mcq; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->mqp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_mqp; + } + } + + { + struct ibv_qp_attr qp_attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = 1, + .dest_qp_num = ctx->mqp->qp_num, + .rq_psn = 0, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = 0, + .sl = 0, + .src_path_bits = 0, + .port_num = port + } + }; + if (ibv_modify_qp(ctx->mqp, &qp_attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + goto clean_mqp; + } + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.timeout = 14; + qp_attr.retry_cnt = 7; + qp_attr.rnr_retry = 7; + qp_attr.sq_psn = 0; + qp_attr.max_rd_atomic = 1; + if (ibv_modify_qp(ctx->mqp, &qp_attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + goto clean_mqp; + } + 
} + + return ctx; + +clean_mqp: + ibv_destroy_qp(ctx->mqp); + +clean_mcq: + ibv_destroy_cq(ctx->mcq); + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_scq: + ibv_destroy_cq(ctx->scq); + +clean_rcq: + ibv_destroy_cq(ctx->rcq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + + +int pp_close_ctx(struct pingpong_context *ctx) +{ + if (ibv_destroy_qp(ctx->mqp)) { + fprintf(stderr, "Couldn't destroy mQP\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->mcq)) { + fprintf(stderr, "Couldn't destroy mCQ\n"); + return 1; + } + + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy QP\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->rcq)) { + fprintf(stderr, "Couldn't destroy rCQ\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->scq)) { + fprintf(stderr, "Couldn't destroy sCQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx->buf); + free(ctx); + + return 0; +} + + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + int rc; + + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PP_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) { + rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); + if (rc) + return rc; + } + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx, int wait_recv) +{ + int rc; + struct ibv_exp_task task_post, task_en, task_wait, *task_p; + struct ibv_wc mwc; + struct ibv_wc wc; + int ne; + + struct ibv_sge list = { + 
.addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + + struct ibv_exp_send_wr wr = { + .wr_id = PP_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .exp_opcode = IBV_EXP_WR_SEND, + .exp_send_flags = IBV_EXP_SEND_SIGNALED, + }; + + struct ibv_exp_send_wr wr_en = { + .wr_id = wr.wr_id, + .sg_list = NULL, + .num_sge = 0, + .exp_opcode = IBV_EXP_WR_SEND_ENABLE, + .exp_send_flags = (wait_recv ? 0 : IBV_EXP_SEND_SIGNALED), + }; + + struct ibv_exp_send_wr wr_wait = { + .wr_id = ctx->scnt, + .sg_list = NULL, + .num_sge = 0, + .exp_opcode = IBV_EXP_WR_CQE_WAIT, + .exp_send_flags = IBV_EXP_SEND_SIGNALED, + }; + + /* fill in send work calc request */ + if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { + wr.exp_opcode = IBV_EXP_WR_SEND; + wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; + wr.sg_list = ctx->calc_op.gather_list; + wr.num_sge = ctx->calc_op.gather_list_size; + + wr.op.calc.calc_op = ctx->calc_op.opcode; + wr.op.calc.data_type = ctx->calc_op.data_type; + wr.op.calc.data_size = ctx->calc_op.data_size; + wr.next = NULL; + } + + memset(&task_post, 0, sizeof(task_post)); + task_post.task_type = IBV_EXP_TASK_SEND; + task_post.item.qp = ctx->qp; + task_post.item.send_wr = &wr; + + task_post.next = NULL; + task_p = &task_post; + + /* fill in send work enable request */ + { + wr_en.task.wqe_enable.qp = ctx->qp; + wr_en.task.wqe_enable.wqe_count = 0; + + wr_en.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; + + memset(&task_en, 0, sizeof(task_en)); + task_en.task_type = IBV_EXP_TASK_SEND; + task_en.item.qp = ctx->mqp; + task_en.item.send_wr = &wr_en; + + task_en.next = NULL; + task_post.next = &task_en; + } + + /* fill in wait work enable request */ + if (wait_recv) { + wr_wait.task.cqe_wait.cq = ctx->rcq; + wr_wait.task.cqe_wait.cq_count = 1; + + wr_wait.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; + wr_wait.next = NULL; + + memset(&task_wait, 0, sizeof(task_wait)); + task_wait.task_type = IBV_EXP_TASK_SEND; + task_wait.item.qp = ctx->mqp; +
task_wait.item.send_wr = &wr_wait; + + task_wait.next = &task_post; + task_p = &task_wait; + } + + + rc = ibv_exp_post_task(ctx->context, task_p, NULL); + if (rc) + return rc; + + do { + rc = ibv_poll_cq(ctx->mcq, 1, &mwc); + if (rc < 0) + return -1; + } while (rc == 0); + + if (mwc.status != IBV_WC_SUCCESS) + return -1; + + do { + ne = ibv_poll_cq(ctx->scq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (!ne); + + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "cqe error status %s (%d v:%d) for count %d\n", + ibv_wc_status_str(wc.status), + wc.status, wc.vendor_err, + ctx->rcnt); + return 1; + } + + return 0; +} + + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", + argv0); + printf(" %s connect to server at \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port= listen on/connect to port" + " (default 18515)\n"); + printf(" -d, --ib-dev= use IB device " + "(default first device found)\n"); + printf(" -i, --ib-port= use port of IB device" + " (default 1)\n"); + printf(" -s, --size= size of message to exchange " + "(default 4096 minimum 16)\n"); + printf(" -m, --mtu= path MTU (default 1024)\n"); + printf(" -r, --rx-depth= number of receives to post" + " at a time (default 500)\n"); + printf(" -n, --iters= number of exchanges" + " (default 1000)\n"); + printf(" -l, --sl= service level value\n"); + printf(" -e, --events sleep on CQ events" + " (default poll)\n"); + printf(" -c, --calc calc operation (supported ADD)\n"); + printf(" -t, --op_type= calc operands type (supported INT64)\n"); + printf(" -o, --operands= comma separated list of" + " operands\n"); + printf(" -w, --wait_cq=cqn wait for entries on cq\n"); +} + + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev = NULL; + struct pingpong_context *ctx; + struct pingpong_dest my_dest; + struct pingpong_dest *rem_dest; + 
struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + int port = 18515; + int ib_port = 1; + int size = 4096; + + enum ibv_mtu mtu = IBV_MTU_1024; + int rx_depth = 500; + int iters = 1000; + int routs; + int num_cq_events = 0; + int sl = 0; + + enum pp_wr_data_type calc_data_type = PP_DATA_TYPE_INVALID; + enum pp_wr_calc_op calc_opcode = PP_CALC_INVALID; + char *calc_operands_str = NULL; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "calc", .has_arg = 1, .val = 'c' }, + { .name = "op_type", .has_arg = 1, .val = 't' }, + { .name = "operands", .has_arg = 1, .val = 'o' }, + { .name = "poll_mqe", .has_arg = 0, .val = 'w' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:et:c:o:wf", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtol(optarg, NULL, 0); + if (size < 16) { + usage(argv[0]); + return 1; + } + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu < 0) { + usage(argv[0]); + return 1; + } + break; + + case 'r': + rx_depth = strtol(optarg, NULL, 0); + break; + + case 'n': + iters = strtol(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + 
break; + + case 't': + calc_data_type = pp_str_to_data_type(optarg); + if (calc_data_type == PP_DATA_TYPE_INVALID) { + printf("-E- invalid data types. Valid values are:\n"); + pp_print_data_type(); + return 1; + } + break; + + case 'o': + calc_operands_str = strdup(optarg); + break; + + case 'c': + calc_opcode = pp_str_to_calc_op(optarg); + if (calc_opcode == PP_CALC_INVALID) { + printf("-E- invalid data types. Valid values are:\n"); + pp_print_calc_op(); + return 1; + } + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) + servername = strdupa(argv[optind]); + else if (optind < argc) { + usage(argv[0]); + return 1; + } + + page_size = sysconf(_SC_PAGESIZE); + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), + ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, + calc_opcode, calc_data_type, calc_operands_str); + if (!ctx) + return 1; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + my_dest.lid = pp_get_local_lid(ctx->context, ib_port); + my_dest.qpn = ctx->qp->qp_num; + my_dest.psn = lrand48() & 0xffffff; + if (!my_dest.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x\n", + my_dest.lid, my_dest.qpn, my_dest.psn); + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, + port, sl, &my_dest); + + if (!rem_dest) + return 1; + + 
printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn); + + if (servername) + if (pp_connect_ctx(ctx, ctx->qp, ib_port, my_dest.psn, mtu, + sl, rem_dest)) + return 1; + + if (servername) { + if (pp_post_send(ctx, 0)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + } + + ctx->scnt = ctx->rcnt = 0; + while (ctx->rcnt < iters && ctx->scnt < iters) { + struct ibv_wc wc; + int ne; + + do { + ne = ibv_poll_cq(ctx->rcq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (ne < 1); + + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "cqe error status %s (%d v:%d)" + " for count %d\n", + ibv_wc_status_str(wc.status), + wc.status, wc.vendor_err, + ctx->rcnt); + return 1; + } + + ctx->rcnt++; + + if (pp_post_recv(ctx, 1) < 0) { + fprintf(stderr, "Couldn't post receive\n"); + return 1; + } + + if (pp_post_send(ctx, 1)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + } + + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. 
/ usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + } + + ibv_ack_cq_events(ctx->rcq, num_cq_events); + + if (pp_close_ctx(ctx)) + return 1; + + ibv_free_device_list(dev_list); + if (calc_operands_str) + free(calc_operands_str); + + free(rem_dest); + + return 0; +} Index: contrib/ofed/libibverbs/examples/uc_pingpong.c =================================================================== --- contrib/ofed/libibverbs/examples/uc_pingpong.c +++ contrib/ofed/libibverbs/examples/uc_pingpong.c @@ -54,6 +54,7 @@ }; static int page_size; +static int family = AF_INET; struct pingpong_context { struct ibv_context *context; @@ -66,7 +67,7 @@ int size; int rx_depth; int pending; - struct ibv_port_attr portinfo; + struct ibv_port_attr portinfo; }; struct pingpong_dest { @@ -100,7 +101,6 @@ attr.ah_attr.grh.dgid = dest->gid; attr.ah_attr.grh.sgid_index = sgid_idx; } - if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_AV | @@ -128,7 +128,7 @@ { struct addrinfo *res, *t; struct addrinfo hints = { - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -150,6 +150,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) @@ -168,7 +170,8 @@ } gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(sockfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); goto out; @@ -180,13 +183,17 @@ goto out; } - write(sockfd, "done", sizeof "done"); + if (write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + goto out; + } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; - sscanf(msg, 
"%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); out: @@ -203,7 +210,7 @@ struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -265,18 +272,22 @@ if (!rem_dest) goto out; - sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); - if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, sgid_idx)) { + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, + sgid_idx)) { fprintf(stderr, "Couldn't connect to remote QP\n"); free(rem_dest); rem_dest = NULL; goto out; } + gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(connfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); @@ -284,7 +295,13 @@ goto out; } - read(connfd, msg, sizeof msg); + /* expecting msg "done" */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } out: close(connfd); @@ -304,26 +321,27 @@ ctx->size = size; ctx->rx_depth = rx_depth; - ctx->buf = malloc(roundup(size, page_size)); + ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); - return NULL; + goto clean_ctx; } - memset(ctx->buf, 0, size); + /* FIXME memset(ctx->buf, 0, size); */ + memset(ctx->buf, 0x7b, size); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); - return 
NULL; + goto clean_buffer; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); - return NULL; + goto clean_device; } } else ctx->channel = NULL; @@ -331,20 +349,20 @@ ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); - return NULL; + goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); - return NULL; + goto clean_pd; } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); - return NULL; + goto clean_mr; } { @@ -363,7 +381,7 @@ ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); - return NULL; + goto clean_cq; } } @@ -381,11 +399,38 @@ IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); - return NULL; + goto clean_qp; } } return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; } int pp_close_ctx(struct pingpong_context *ctx) @@ -486,6 +531,7 @@ printf(" -l, --sl= service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -g, --gid-idx= local port gid index\n"); + printf(" -6, --ipv6 use IPv6\n"); } int main(int argc, char *argv[]) @@ -528,10 +574,12 @@ { .name = "sl", .has_arg = 1, .val = 'l' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "ipv6", .has_arg = 0, .val = '6' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:", 
long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:6", + long_options, NULL); if (c == -1) break; @@ -545,7 +593,7 @@ break; case 'd': - ib_devname = strdup(optarg); + ib_devname = strdupa(optarg); break; case 'i': @@ -588,6 +636,10 @@ gidx = strtol(optarg, NULL, 0); break; + case '6': + family = AF_INET6; + break; + default: usage(argv[0]); return 1; @@ -595,7 +647,7 @@ } if (optind == argc - 1) - servername = strdup(argv[optind]); + servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; @@ -643,20 +695,22 @@ return 1; } + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { fprintf(stderr, "Couldn't get port info\n"); return 1; } my_dest.lid = ctx->portinfo.lid; - if (ctx->portinfo.link_layer == IBV_LINK_LAYER_INFINIBAND && !my_dest.lid) { + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && + !my_dest.lid) { fprintf(stderr, "Couldn't get local LID\n"); return 1; } if (gidx >= 0) { if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { - fprintf(stderr, "Could not get local gid for gid index %d\n", gidx); + fprintf(stderr, "can't read sgid of index %d\n", gidx); return 1; } } else @@ -668,10 +722,12 @@ printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", my_dest.lid, my_dest.qpn, my_dest.psn, gid); + if (servername) rem_dest = pp_client_exch_dest(servername, port, &my_dest); else - rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, &my_dest, gidx); + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + &my_dest, gidx); if (!rem_dest) return 1; @@ -681,7 +737,8 @@ rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); if (servername) - if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, gidx)) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, + gidx)) return 1; ctx->pending = PINGPONG_RECV_WRID; @@ -733,6 +790,7 @@ fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } + } while (!use_event && ne < 1); for (i = 0; i < ne; 
++i) { Index: contrib/ofed/libibverbs/examples/ud_pingpong.c =================================================================== --- contrib/ofed/libibverbs/examples/ud_pingpong.c +++ contrib/ofed/libibverbs/examples/ud_pingpong.c @@ -54,6 +54,7 @@ }; static int page_size; +static int family = AF_INET; struct pingpong_context { struct ibv_context *context; @@ -127,7 +128,7 @@ { struct addrinfo *res, *t; struct addrinfo hints = { - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -149,6 +150,8 @@ } for (t = res; t; t = t->ai_next) { + if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) @@ -167,25 +170,30 @@ } gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(sockfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); goto out; } - if (read(sockfd, msg, sizeof msg) != sizeof msg) { + if (recv(sockfd, msg, sizeof(msg), MSG_WAITALL) != sizeof(msg)) { perror("client read"); fprintf(stderr, "Couldn't read remote address\n"); goto out; } - write(sockfd, "done", sizeof "done"); + if (write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + goto out; + } rem_dest = malloc(sizeof *rem_dest); if (!rem_dest) goto out; - sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); out: @@ -201,7 +209,7 @@ struct addrinfo *res, *t; struct addrinfo hints = { .ai_flags = AI_PASSIVE, - .ai_family = AF_INET, + .ai_family = family, .ai_socktype = SOCK_STREAM }; char *service; @@ -223,6 +231,8 @@ } for (t = res; t; t = t->ai_next) { + 
if (t->ai_family != family) + continue; sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); if (sockfd >= 0) { n = 1; @@ -252,7 +262,7 @@ return NULL; } - n = read(connfd, msg, sizeof msg); + n = recv(connfd, msg, sizeof(msg), MSG_WAITALL); if (n != sizeof msg) { perror("server read"); fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); @@ -263,10 +273,12 @@ if (!rem_dest) goto out; - sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, &rem_dest->psn, gid); + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); wire_gid_to_gid(gid, &rem_dest->gid); - if (pp_connect_ctx(ctx, ib_port, my_dest->psn, sl, rem_dest, sgid_idx)) { + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, sl, rem_dest, + sgid_idx)) { fprintf(stderr, "Couldn't connect to remote QP\n"); free(rem_dest); rem_dest = NULL; @@ -274,7 +286,8 @@ } gid_to_wire_gid(&my_dest->gid, gid); - sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, my_dest->psn, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); if (write(connfd, msg, sizeof msg) != sizeof msg) { fprintf(stderr, "Couldn't send local address\n"); free(rem_dest); @@ -282,7 +295,13 @@ goto out; } - read(connfd, msg, sizeof msg); + /* expecting msg "done" */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } out: close(connfd); @@ -302,26 +321,27 @@ ctx->size = size; ctx->rx_depth = rx_depth; - ctx->buf = malloc(roundup(size + 40, page_size)); + ctx->buf = memalign(page_size, size + 40); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); - return NULL; + goto clean_ctx; } - memset(ctx->buf, 0, size + 40); + /* FIXME memset(ctx->buf, 0, size + 40); */ + memset(ctx->buf, 0x7b, size + 40); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", 
ibv_get_device_name(ib_dev)); - return NULL; + goto clean_buffer; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); - return NULL; + goto clean_device; } } else ctx->channel = NULL; @@ -329,20 +349,20 @@ ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); - return NULL; + goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size + 40, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); - return NULL; + goto clean_pd; } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); - return NULL; + goto clean_mr; } { @@ -361,7 +381,7 @@ ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); - return NULL; + goto clean_cq; } } @@ -379,11 +399,38 @@ IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify QP to INIT\n"); - return NULL; + goto clean_qp; } } return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; } int pp_close_ctx(struct pingpong_context *ctx) @@ -494,6 +541,7 @@ printf(" -n, --iters= number of exchanges (default 1000)\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -g, --gid-idx= local port gid index\n"); + printf(" -6, --ipv6 use IPv6\n"); } int main(int argc, char *argv[]) @@ -516,8 +564,8 @@ int rcnt, scnt; int num_cq_events = 0; int sl = 0; - int gidx = -1; - char gid[33]; + int gidx = -1; + char gid[INET6_ADDRSTRLEN]; srand48(getpid() * time(NULL)); @@ -534,10 +582,12 @@ { .name = "sl", 
.has_arg = 1, .val = 'l' }, { .name = "events", .has_arg = 0, .val = 'e' }, { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "ipv6", .has_arg = 0, .val = '6' }, { 0 } }; - c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:", long_options, NULL); + c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:6", + long_options, NULL); if (c == -1) break; @@ -551,7 +601,7 @@ break; case 'd': - ib_devname = strdup(optarg); + ib_devname = strdupa(optarg); break; case 'i': @@ -586,6 +636,10 @@ gidx = strtol(optarg, NULL, 0); break; + case '6': + family = AF_INET6; + break; + default: usage(argv[0]); return 1; @@ -593,7 +647,7 @@ } if (optind == argc - 1) - servername = strdup(argv[optind]); + servername = strdupa(argv[optind]); else if (optind < argc) { usage(argv[0]); return 1; @@ -652,7 +706,8 @@ if (gidx >= 0) { if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { - fprintf(stderr, "Could not get local gid for gid index %d\n", gidx); + fprintf(stderr, "Could not get local gid for gid index " + "%d\n", gidx); return 1; } } else @@ -665,7 +720,8 @@ if (servername) rem_dest = pp_client_exch_dest(servername, port, &my_dest); else - rem_dest = pp_server_exch_dest(ctx, ib_port, port, sl, &my_dest, gidx); + rem_dest = pp_server_exch_dest(ctx, ib_port, port, sl, + &my_dest, gidx); if (!rem_dest) return 1; @@ -675,7 +731,8 @@ rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); if (servername) - if (pp_connect_ctx(ctx, ib_port, my_dest.psn, sl, rem_dest, gidx)) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, sl, rem_dest, + gidx)) return 1; ctx->pending = PINGPONG_RECV_WRID; Index: contrib/ofed/libibverbs/examples/umr_rc.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/umr_rc.c @@ -0,0 +1,1248 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pingpong.h" + +#ifndef min +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) +#endif + +enum { + UMR_RECV_WRID = 1, + UMR_SEND_WRID = 2, +}; + +static int page_size; +static int use_contiguous_mr; + +struct umr_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr **mr_arr; + int num_mrs; + struct ibv_mr *umr; + struct ibv_exp_mkey_list_container *mkey_list_container; + struct ibv_cq *cq; + struct ibv_qp *qp; + void **buf; + int size; + int rx_depth; + int pending; + struct ibv_port_attr portinfo; + int inlr_recv; +}; + +struct umr_dest { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +static int pp_connect_ctx(struct umr_context *ctx, int port, int my_psn, + enum ibv_mtu mtu, int sl, + struct umr_dest *dest, int sgid_idx) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = mtu, + .dest_qp_num = dest->qpn, + .rq_psn = dest->psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = dest->lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (dest->gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = dest->gid; + attr.ah_attr.grh.sgid_index = sgid_idx; + } + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = my_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +static struct umr_dest *pp_client_exch_dest(const char *servername, int port, + const struct umr_dest *my_dest) +{ + struct 
addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1; + struct umr_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return NULL; + } + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(sockfd, msg, sizeof(msg)) != sizeof(msg)) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + + if (read(sockfd, msg, sizeof(msg)) != sizeof(msg)) { + perror("client read"); + fprintf(stderr, "Couldn't read remote address\n"); + goto out; + } + + if (write(sockfd, "done", sizeof("done")) != sizeof("done")) { + fprintf(stderr, "Couldn't send \"done\" msg\n"); + goto out; + } + + rem_dest = malloc(sizeof(*rem_dest)); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + +out: + close(sockfd); + return rem_dest; +} + +static struct umr_dest *pp_server_exch_dest(struct umr_context *ctx, + int ib_port, enum ibv_mtu mtu, + int port, int sl, + const struct umr_dest *my_dest, + int sgid_idx) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char 
*service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1, connfd; + struct umr_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n)); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, 0); + close(sockfd); + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + n = read(connfd, msg, sizeof(msg)); + if (n != sizeof(msg)) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof(msg)); + goto out; + } + + rem_dest = malloc(sizeof(*rem_dest)); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, + sgid_idx)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(connfd, msg, sizeof(msg)) != sizeof(msg)) { + fprintf(stderr, "Couldn't send local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + /* expecting msg "done" */ + if (read(connfd, msg, sizeof(msg)) <= 0) { + fprintf(stderr, "Couldn't read \"done\" msg\n"); + 
free(rem_dest);
+		rem_dest = NULL;
+		goto out;
+	}
+
+out:
+	close(connfd);
+	return rem_dest;
+}
+
+/*
+ * Create the indirect (UMR) memory region that spans the MRs in ctx->mr_arr.
+ *
+ * Builds either a repeat-block list (use_repeat_block != 0, using rb_len,
+ * rb_stride and rb_count) or a plain MR list, creates ctx->umr, optionally
+ * allocates ctx->mkey_list_container (umr_ninl_send != 0, sized by
+ * list_length), posts an IBV_EXP_WR_UMR_FILL work request on ctx->qp and
+ * busy-polls ctx->cq for its completion.
+ *
+ * The size argument is not used by this function.
+ *
+ * Returns 0 on success, with ctx->umr->addr and ctx->umr->length filled in.
+ * Returns -1 on any failure; in that case everything created here has been
+ * released and ctx->umr / ctx->mkey_list_container are reset to NULL.
+ */
+static int create_umr(struct umr_context *ctx, int num_mrs,
+		      int use_repeat_block, int umr_ninl_send,
+		      int list_length, int size, int rb_len,
+		      int rb_stride, int rb_count)
+{
+	struct ibv_exp_create_mr_in mrin;
+	struct ibv_exp_mem_region *mem_reg_list = NULL;
+	struct ibv_exp_mem_repeat_block *mem_rep_list = NULL;
+	struct ibv_exp_send_wr wr;
+	struct ibv_exp_send_wr *bad_wr;
+	int rc;
+	int i, err = 0;
+	int umr_len = 0;
+	struct ibv_exp_wc wc;
+	int ne;
+	int ndim = 1;
+	size_t *rpt_cnt = NULL;
+
+	if (use_repeat_block) {
+		mem_rep_list = calloc(num_mrs, sizeof(*mem_rep_list));
+		if (!mem_rep_list) {
+			fprintf(stderr, "Failed to allocate mkey_list\n");
+			return -1;
+		}
+
+		for (i = 0; i < num_mrs; i++) {
+			mem_rep_list[i].byte_count = calloc(ndim, sizeof(mem_rep_list[i].byte_count[0]));
+			if (!mem_rep_list[i].byte_count) {
+				/* must report the failure, not fall out with err == 0 */
+				err = -1;
+				goto clean_mem_reg_list;
+			}
+
+			mem_rep_list[i].stride = calloc(ndim, sizeof(mem_rep_list[i].stride[0]));
+			if (!mem_rep_list[i].stride) {
+				err = -1;
+				goto clean_mem_reg_list;
+			}
+		}
+
+		rpt_cnt = calloc(ndim, sizeof(*rpt_cnt));
+		if (!rpt_cnt) {
+			fprintf(stderr, "Failed to allocate rpt_cnt\n");
+			err = -1;
+			goto clean_mem_reg_list;
+		}
+
+		for (i = 0; i < ndim; i++)
+			rpt_cnt[i] = rb_count;
+
+		for (i = 0; i < num_mrs; i++) {
+			mem_rep_list[i].base_addr = (uint64_t)(uintptr_t)ctx->mr_arr[i]->addr;
+			mem_rep_list[i].byte_count[0] = rb_len;
+			mem_rep_list[i].mr = ctx->mr_arr[i];
+			mem_rep_list[i].stride[0] = rb_stride;
+
+			umr_len += rb_count * mem_rep_list[i].byte_count[0];
+		}
+	} else {
+		mem_reg_list = calloc(num_mrs, sizeof(*mem_reg_list));
+		if (!mem_reg_list) {
+			fprintf(stderr, "Failed to allocate mkey_list\n");
+			return -1;
+		}
+
+		for (i = 0; i < num_mrs; i++) {
+			mem_reg_list[i].base_addr = (uint64_t)(uintptr_t)ctx->mr_arr[i]->addr;
+			mem_reg_list[i].length = ctx->mr_arr[i]->length;
+			mem_reg_list[i].mr = ctx->mr_arr[i];
+			umr_len += mem_reg_list[i].length;
+		}
+	}
+
+	memset(&mrin, 0, sizeof(mrin));
+	mrin.pd = ctx->pd;
+	mrin.attr.create_flags = IBV_EXP_MR_INDIRECT_KLMS;
+	mrin.attr.exp_access_flags = IBV_EXP_ACCESS_LOCAL_WRITE;
+	mrin.attr.max_klm_list_size = num_mrs;
+	ctx->umr = ibv_exp_create_mr(&mrin);
+	if (!ctx->umr) {
+		fprintf(stderr, "Failed to create modified_mr\n");
+		err = -1;
+		goto clean_rpt_cnt;
+	}
+
+	if (umr_ninl_send) {
+		struct ibv_exp_mkey_list_container_attr in = {
+			.pd = ctx->pd,
+			.mkey_list_type = IBV_EXP_MKEY_LIST_TYPE_INDIRECT_MR,
+			.max_klm_list_size = list_length
+		};
+		ctx->mkey_list_container = ibv_exp_alloc_mkey_list_memory(&in);
+		if (!ctx->mkey_list_container) {
+			fprintf(stderr, "Failed to allocate alloc_mkey_list_memory\n");
+			err = -1;
+			goto clean_umr;
+		}
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	if (use_repeat_block) {
+		wr.ext_op.umr.umr_type = IBV_EXP_UMR_REPEAT;
+		wr.ext_op.umr.mem_list.rb.mem_repeat_block_list = mem_rep_list;
+		wr.ext_op.umr.mem_list.rb.stride_dim = 1;
+		wr.ext_op.umr.mem_list.rb.repeat_count = rpt_cnt;
+	} else {
+		wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST;
+		wr.ext_op.umr.mem_list.mem_reg_list = mem_reg_list;
+	}
+
+	if (umr_ninl_send)
+		wr.ext_op.umr.memory_objects = ctx->mkey_list_container;
+	else
+		wr.exp_send_flags = IBV_EXP_SEND_INLINE;
+
+	wr.ext_op.umr.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE;
+	wr.ext_op.umr.modified_mr = ctx->umr;
+	wr.ext_op.umr.base_addr = (uint64_t)(uintptr_t)ctx->mr_arr[0]->addr;
+	wr.ext_op.umr.num_mrs = num_mrs;
+	wr.exp_send_flags |= IBV_EXP_SEND_SIGNALED;
+	wr.exp_opcode = IBV_EXP_WR_UMR_FILL;
+
+	rc = ibv_exp_post_send(ctx->qp, &wr, &bad_wr);
+	if (rc) {
+		fprintf(stderr, "Failed in ibv_exp_post_send IBV_EXP_WR_UMR_FILL\n");
+		err = -1;
+		goto clean_mkey_list;
+	}
+
+	ne = 0;
+	while (!ne) {
+		ne = ibv_exp_poll_cq(ctx->cq, 1, &wc, sizeof(wc));
+		if (ne < 0) {
+			fprintf(stderr, "poll CQ failed after IBV_EXP_WR_UMR_FILL\n");
+			goto invalidate_umr;
+		}
+	}
+
+	if (wc.status != IBV_WC_SUCCESS) {
+		fprintf(stderr, "comp status %d\n", wc.status);
+		goto invalidate_umr;
+	}
+
+	ctx->umr->length = umr_len;
+	ctx->umr->addr = (void *)(unsigned long)wr.ext_op.umr.base_addr;
+
+	/*
+	 * NOTE(review): the region / repeat-block lists and rpt_cnt are not
+	 * released on this success path - confirm whether the provider still
+	 * references them after the FILL completion before freeing them here.
+	 */
+	return err;
+
+invalidate_umr:
+	/* both jumps to this label are failures; never return 0 from here */
+	err = -1;
+	wr.exp_opcode = IBV_EXP_WR_UMR_INVALIDATE;
+	ibv_exp_post_send(ctx->qp, &wr, &bad_wr);
+
+clean_mkey_list:
+	if (umr_ninl_send) {
+		ibv_exp_dealloc_mkey_list_memory(ctx->mkey_list_container);
+		ctx->mkey_list_container = NULL;
+	}
+
+clean_umr:
+	ibv_dereg_mr(ctx->umr);
+	/* do not leave a dangling handle behind for later teardown code */
+	ctx->umr = NULL;
+
+clean_rpt_cnt:
+	free(rpt_cnt);
+
+clean_mem_reg_list:
+	if (use_repeat_block) {
+		for (i = 0; i < num_mrs; i++) {
+			free(mem_rep_list[i].stride);
+			free(mem_rep_list[i].byte_count);
+		}
+		free(mem_rep_list);
+	}
+	free(mem_reg_list);
+
+	return err;
+}
+
+static struct umr_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
+				       int rx_depth, int port,
+				       int use_event, int inlr_recv,
+				       int num_mrs)
+{
+	struct umr_context *ctx;
+	struct ibv_exp_device_attr dattr;
+	int ret, i;
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (!ctx)
+		return NULL;
+
+	memset(&dattr, 0, sizeof(dattr));
+
+	ctx->num_mrs = num_mrs;
+	ctx->size = size * num_mrs;
+	ctx->rx_depth = rx_depth;
+
+	ctx->buf = calloc(num_mrs, sizeof(void *));
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		goto clean_ctx;
+	}
+	if (!use_contiguous_mr) {
+		for (i = 0; i < num_mrs; i++) {
+			ctx->buf[i] = memalign(page_size, size);
+			/* check the element just allocated, not the array pointer */
+			if (!ctx->buf[i]) {
+				fprintf(stderr, "Couldn't allocate work buf.\n");
+				/* release what was allocated so far before bailing out */
+				while (i-- > 0)
+					free(ctx->buf[i]);
+				free(ctx->buf);
+				goto clean_ctx;
+			}
+		}
+	}
+
+	ctx->context = ibv_open_device(ib_dev);
+	if (!ctx->context) {
+		fprintf(stderr, "Couldn't get context for %s\n",
+			ibv_get_device_name(ib_dev));
+		goto clean_buffer;
+	}
+
+	if (inlr_recv)
+		dattr.comp_mask |= IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
+
+	ret = ibv_exp_query_device(ctx->context, &dattr);
+	if (inlr_recv) {
+		if (ret) {
+			printf(" Couldn't query device for inline-receive capabilities.\n");
+		} else if (!(dattr.comp_mask & IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ)) {
+			printf(" 
Inline-receive not supported by driver.\n");
+		} else if (dattr.inline_recv_sz < inlr_recv) {
+			printf(" Max inline-receive(%d) < Requested inline-receive(%d).\n",
+			       dattr.inline_recv_sz, inlr_recv);
+		}
+	}
+	ctx->inlr_recv = inlr_recv;
+
+	memset(&dattr, 0, sizeof(dattr));
+	dattr.comp_mask |= IBV_EXP_DEVICE_ATTR_UMR;
+	ret = ibv_exp_query_device(ctx->context, &dattr);
+	if (ret)
+		printf(" Couldn't query device for UMR capabilities.\n");
+
+	if (use_event) {
+		ctx->channel = ibv_create_comp_channel(ctx->context);
+		if (!ctx->channel) {
+			fprintf(stderr, "Couldn't create completion channel\n");
+			goto clean_device;
+		}
+	} else {
+		ctx->channel = NULL;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		goto clean_comp_channel;
+	}
+
+	/* zero-filled so the cleanup path can tell registered slots apart */
+	ctx->mr_arr = calloc(num_mrs, sizeof(struct ibv_mr *));
+	if (!ctx->mr_arr) {
+		fprintf(stderr, "Couldn't allocate MR array\n");
+		goto clean_pd;
+	}
+
+	for (i = 0; i < num_mrs; i++) {
+		if (!use_contiguous_mr) {
+			ctx->mr_arr[i] = ibv_reg_mr(ctx->pd, ctx->buf[i], size,
+						    IBV_ACCESS_LOCAL_WRITE);
+		} else {
+			struct ibv_exp_reg_mr_in in;
+
+			memset(&in, 0, sizeof(in));
+			in.pd = ctx->pd;
+			in.addr = NULL;
+			in.length = size;
+			in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE |
+					IBV_EXP_ACCESS_ALLOCATE_MR;
+			in.comp_mask = 0;
+
+			ctx->mr_arr[i] = ibv_exp_reg_mr(&in);
+		}
+
+		if (!ctx->mr_arr[i]) {
+			fprintf(stderr, "Couldn't register MR num %d\n", i);
+			/* go through clean_mr so MRs 0..i-1 are deregistered */
+			goto clean_mr;
+		} else {
+			if (use_contiguous_mr)
+				ctx->buf[i] = ctx->mr_arr[i]->addr;
+		}
+
+		memset(ctx->buf[i], i, size);
+	}
+
+	ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL,
+				ctx->channel, 0);
+	if (!ctx->cq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		goto clean_mr;
+	}
+
+	{
+		struct ibv_exp_qp_init_attr attr = {
+			.send_cq = ctx->cq,
+			.recv_cq = ctx->cq,
+			.cap = {
+				.max_send_wr = 200,
+				.max_recv_wr = rx_depth,
+				.max_send_sge = 2,
+				.max_recv_sge = 2,
+			},
+			.qp_type = IBV_QPT_RC,
+			.pd = ctx->pd,
+			.comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
+			.max_inl_recv = ctx->inlr_recv,
+			.max_inl_send_klms = dattr.umr_caps.max_send_wqe_inline_klms
+		};
+		if (ctx->inlr_recv)
+			attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
+
+		attr.exp_create_flags |= IBV_EXP_QP_CREATE_UMR;
+		attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS |
+				  IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS;
+		ctx->qp = ibv_exp_create_qp(ctx->context, &attr);
+
+		if (!ctx->qp) {
+			fprintf(stderr, "Couldn't create QP, errno = %d\n", errno);
+			goto clean_cq;
+		}
+		if (ctx->inlr_recv > attr.max_inl_recv)
+			printf(" Actual inline-receive(%d) < requested inline-receive(%d)\n",
+			       attr.max_inl_recv, ctx->inlr_recv);
+	}
+
+	{
+		struct ibv_qp_attr attr = {
+			.qp_state = IBV_QPS_INIT,
+			.pkey_index = 0,
+			.port_num = port,
+			.qp_access_flags = 0
+		};
+
+		if (ibv_modify_qp(ctx->qp, &attr,
+				  IBV_QP_STATE |
+				  IBV_QP_PKEY_INDEX |
+				  IBV_QP_PORT |
+				  IBV_QP_ACCESS_FLAGS)) {
+			fprintf(stderr, "Failed to modify QP to INIT\n");
+			goto clean_qp;
+		}
+	}
+
+	return ctx;
+
+clean_qp:
+	ibv_destroy_qp(ctx->qp);
+
+clean_cq:
+	ibv_destroy_cq(ctx->cq);
+
+clean_mr:
+	/* only slots that were actually registered may be deregistered */
+	for (i = 0; i < num_mrs; i++)
+		if (ctx->mr_arr[i])
+			ibv_dereg_mr(ctx->mr_arr[i]);
+	free(ctx->mr_arr);
+
+clean_pd:
+	ibv_dealloc_pd(ctx->pd);
+
+clean_comp_channel:
+	if (ctx->channel)
+		ibv_destroy_comp_channel(ctx->channel);
+
+clean_device:
+	ibv_close_device(ctx->context);
+
+clean_buffer:
+	/*
+	 * In contiguous mode the buf[] entries point into MR-owned memory and
+	 * must not be freed; the pointer array itself is ours in both modes.
+	 */
+	if (!use_contiguous_mr && ctx->buf)
+		for (i = 0; i < num_mrs; i++)
+			free(ctx->buf[i]);
+	free(ctx->buf);
+
+clean_ctx:
+	free(ctx);
+
+	return NULL;
+}
+
+/*
+ * Post a signaled IBV_EXP_WR_UMR_INVALIDATE for ctx->umr on ctx->qp and
+ * busy-poll ctx->cq for one completion.
+ *
+ * Returns 0 when there is nothing to invalidate (no UMR, or a UMR whose addr
+ * was never set) or when the completion reports success; returns 1 if the
+ * post, the poll or the completion fails.
+ */
+int invalidate_umr(struct umr_context *ctx)
+{
+	struct ibv_exp_send_wr wr, *bad_wr;
+	struct ibv_exp_wc wc;
+	int rc;
+	int ne;
+
+	if (!ctx->umr)
+		return 0;
+
+	/* nothing is posted in this case, so there is no completion to wait for */
+	if (!ctx->umr->addr)
+		return 0;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.exp_opcode = IBV_EXP_WR_UMR_INVALIDATE;
+	wr.ext_op.umr.modified_mr = ctx->umr;
+	wr.exp_send_flags = IBV_EXP_SEND_SIGNALED;
+	rc = ibv_exp_post_send(ctx->qp, &wr, &bad_wr);
+	if (rc) {
+		printf("failed to invalidate_umr (%d)\n", rc);
+		/* polling for a completion that was never queued would spin forever */
+		return 1;
+	}
+
+	do {
+		ne = ibv_exp_poll_cq(ctx->cq, 1, &wc, sizeof(wc));
+		if (ne < 0) {
+			fprintf(stderr, "poll CQ failed after IBV_EXP_WR_UMR_INVALIDATE\n");
+			return 1;
+		}
+	} while (!ne);
+
+	
if (wc.status != IBV_WC_SUCCESS) {
+		printf("comp status %d\n", wc.status);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down everything held by ctx, then free ctx itself.
+ *
+ * Order: invalidate and deregister the UMR, release the mkey list memory,
+ * destroy the QP and CQ, deregister the per-buffer MRs, deallocate the PD,
+ * destroy the completion channel (if any), close the device and free the
+ * buffers.
+ *
+ * Returns 0 on success. Returns 1 as soon as one verbs teardown call fails;
+ * the remaining resources, including ctx, are left untouched in that case.
+ */
+int pp_close_ctx(struct umr_context *ctx)
+{
+	int i;
+
+	/* best effort: a failed invalidation is reported by invalidate_umr itself */
+	invalidate_umr(ctx);
+	/* ctx->umr may be NULL (invalidate_umr allows it); don't pass NULL on */
+	if (ctx->umr && ibv_dereg_mr(ctx->umr)) {
+		fprintf(stderr, "Couldn't deregister UMR\n");
+		return 1;
+	}
+
+	if (ctx->mkey_list_container) {
+		if (ibv_exp_dealloc_mkey_list_memory(ctx->mkey_list_container)) {
+			fprintf(stderr, "Couldn't dealloc mkey list memory\n");
+			return 1;
+		}
+	}
+
+	if (ibv_destroy_qp(ctx->qp)) {
+		fprintf(stderr, "Couldn't destroy QP\n");
+		return 1;
+	}
+
+	if (ibv_destroy_cq(ctx->cq)) {
+		fprintf(stderr, "Couldn't destroy CQ\n");
+		return 1;
+	}
+
+	for (i = 0; i < ctx->num_mrs; i++)
+		if (ibv_dereg_mr(ctx->mr_arr[i])) {
+			fprintf(stderr, "Couldn't deregister MR\n");
+			return 1;
+		}
+
+	if (ibv_dealloc_pd(ctx->pd)) {
+		fprintf(stderr, "Couldn't deallocate PD\n");
+		return 1;
+	}
+
+	if (ctx->channel) {
+		if (ibv_destroy_comp_channel(ctx->channel)) {
+			fprintf(stderr, "Couldn't destroy completion channel\n");
+			return 1;
+		}
+	}
+
+	if (ibv_close_device(ctx->context)) {
+		fprintf(stderr, "Couldn't release context\n");
+		return 1;
+	}
+
+	/* in contiguous mode the buffers belong to the MRs and are not freed here */
+	if (!use_contiguous_mr)
+		for (i = 0; i < ctx->num_mrs; i++)
+			free(ctx->buf[i]);
+
+	/* the two pointer arrays are separate allocations owned by ctx */
+	free(ctx->buf);
+	free(ctx->mr_arr);
+	free(ctx);
+
+	return 0;
+}
+
+/*
+ * Post up to n receive work requests on ctx->qp, each described by a single
+ * SGE covering the whole UMR. Returns the number of requests posted, which
+ * is less than n if ibv_post_recv fails.
+ */
+static int pp_post_recv(struct umr_context *ctx, int n)
+{
+	struct ibv_sge list = {
+		.addr = (uintptr_t) ctx->umr->addr,
+		.length = ctx->umr->length,
+		.lkey = ctx->umr->lkey
+	};
+	struct ibv_recv_wr wr = {
+		.wr_id = UMR_RECV_WRID,
+		.sg_list = &list,
+		.num_sge = 1,
+	};
+	struct ibv_recv_wr *bad_wr;
+	int i;
+
+	for (i = 0; i < n; ++i)
+		if (ibv_post_recv(ctx->qp, &wr, &bad_wr))
+			break;
+
+	return i;
+}
+
+static int pp_post_send(struct umr_context *ctx)
+{
+	struct ibv_sge list = {
+		.addr = (uintptr_t) ctx->umr->addr,
+		.length = ctx->umr->length,
+		.lkey = ctx->umr->lkey
+	};
+	struct ibv_send_wr wr = {
+		.wr_id = UMR_SEND_WRID,
+		.sg_list = &list,
+		.num_sge = 1,
+		.opcode = IBV_WR_SEND,
+		.send_flags 
= IBV_SEND_SIGNALED, + }; + struct ibv_send_wr *bad_wr; + + return ibv_post_send(ctx->qp, &wr, &bad_wr); +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s connect to server at \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port= listen on/connect to port (default 18515)\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); + printf(" -s, --size= size of message to exchange (default 4096)\n"); + printf(" -m, --mtu= path MTU (default 1024)\n"); + printf(" -r, --rx-depth= number of receives to post at a time (default 500)\n"); + printf(" -n, --iters= number of exchanges (default 1000)\n"); + printf(" -l, --sl= service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -g, --gid-idx= local port gid index\n"); + printf(" -c, --contiguous-mr use contiguous mr\n"); + printf(" -t, --inline-recv= size of inline-recv\n"); + printf(" -x, --num-mrs create umr with num-mrs (default 3)\n"); + printf(" -u, --umr-non-inline-send use umr-non-inline send (default inline send)\n"); + printf(" -b, --repeated-block which memory block use for umr creation (default memory block)\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + struct umr_context *ctx; + struct umr_dest my_dest; + struct umr_dest *rem_dest; + struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + int port = 18515; + int ib_port = 1; + int size = 4096; + enum ibv_mtu mtu = IBV_MTU_1024; + int rx_depth = 500; + int iters = 1000; + int use_event = 0; + int routs; + int rcnt, scnt; + int num_cq_events = 0; + int sl = 0; + int gidx = -1; + char gid[INET6_ADDRSTRLEN]; + int inlr_recv = 0; + int umr_ninl_send = 0; + int num_mrs = 3; + int use_repeat_block = 0; + int rb_len = 50; + int rb_stride = 
100; + int rb_count = 30; + struct ibv_exp_device_attr dattr; + int ret; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "contiguous-mr", .has_arg = 0, .val = 'c' }, + { .name = "inline-recv", .has_arg = 1, .val = 't' }, + { .name = "num-mrs", .has_arg = 1, .val = 'x' }, + { .name = "umr-non-inline-send", .has_arg = 0, .val = 'u' }, + { .name = "repeated-block", .has_arg = 0, .val = 'b' }, + { .name = "repeated-block-len", .has_arg = 1, .val = 'h' }, + { .name = "repeated-block-stide", .has_arg = 1, .val = 'k' }, + { .name = "repeated-block-count", .has_arg = 1, .val = 'o' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:ecg:t:x:ubh:k:o:", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtol(optarg, NULL, 0); + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu < 0) { + usage(argv[0]); + return 1; + } + break; + + case 'r': + rx_depth = strtol(optarg, NULL, 0); + break; + + case 'n': + iters = strtol(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'g': + gidx = 
strtol(optarg, NULL, 0); + break; + + case 'c': + ++use_contiguous_mr; + break; + + case 't': + inlr_recv = strtol(optarg, NULL, 0); + break; + + case 'x': + num_mrs = strtol(optarg, NULL, 0); + break; + + case 'u': + ++umr_ninl_send; + break; + + case 'b': + ++use_repeat_block; + break; + + case 'h': + rb_len = strtol(optarg, NULL, 0); + break; + + case 'k': + rb_stride = strtol(optarg, NULL, 0); + break; + + case 'o': + rb_count = strtol(optarg, NULL, 0); + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) { + servername = strdupa(argv[optind]); + } else if (optind < argc) { + usage(argv[0]); + return 1; + } + + page_size = sysconf(_SC_PAGESIZE); + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event, inlr_recv, num_mrs); + if (!ctx) + return 1; + + if (use_event) + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + my_dest.lid = ctx->portinfo.lid; + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && + !my_dest.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + if (gidx >= 0) { + if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { + fprintf(stderr, "can't read sgid of index %d\n", gidx); + return 1; + } + } else { + memset(&my_dest.gid, 0, sizeof(my_dest.gid)); + } + + my_dest.qpn = ctx->qp->qp_num; + 
my_dest.psn = lrand48() & 0xffffff; + inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof(gid)); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + my_dest.lid, my_dest.qpn, my_dest.psn, gid); + + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + &my_dest, gidx); + + if (!rem_dest) + return 1; + + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof(gid)); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); + + if (servername) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, + gidx)) + return 1; + + memset(&dattr, 0, sizeof(dattr)); + dattr.comp_mask |= IBV_EXP_DEVICE_ATTR_UMR; + ret = ibv_exp_query_device(ctx->context, &dattr); + if (ret) { + printf(" Couldn't query device for UMR capabilities.\n"); + return 1; + } + + if (create_umr(ctx, num_mrs, use_repeat_block, umr_ninl_send, dattr.umr_caps.max_klm_list_size, + size, rb_len, rb_stride, rb_count)) { + fprintf(stderr, "Failed to create umr\n"); + return 1; + } + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + ctx->pending = UMR_RECV_WRID; + + if (servername) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending |= UMR_SEND_WRID; + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + ++num_cq_events; + + if (ev_cq != ctx->cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ 
notification\n"); + return 1; + } + } + + { + struct ibv_exp_wc wc[2]; + int ne, i; + + do { + ne = ibv_exp_poll_cq(ctx->cq, 2, wc, sizeof(wc[0])); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + switch ((int) wc[i].wr_id) { + case UMR_SEND_WRID: + ++scnt; + break; + + case UMR_RECV_WRID: + if (--routs <= 1) { + routs += pp_post_recv(ctx, ctx->rx_depth - routs); + if (routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + routs); + return 1; + } + } + + ++rcnt; + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + + ctx->pending &= ~(int) wc[i].wr_id; + if (scnt < iters && !ctx->pending) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending = UMR_RECV_WRID | + UMR_SEND_WRID; + } + } + } + } + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * num_mrs * iters * 2; + + printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + } + + ibv_ack_cq_events(ctx->cq, num_cq_events); + + if (pp_close_ctx(ctx)) + return 1; + + ibv_free_device_list(dev_list); + free(rem_dest); + + return 0; +} Index: contrib/ofed/libibverbs/examples/xsrq_pingpong.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/examples/xsrq_pingpong.c @@ -0,0 +1,1027 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. 
+ * Copyright (c) 2011 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pingpong.h" + +#define MSG_FORMAT "%04x:%06x:%06x:%06x:%04x:%32s" +#define MSG_SIZE 66 +#define MSG_SSCAN "%x:%x:%x:%x:%x:%s" +#define ADDR_FORMAT \ + "%8s: LID %04x, QPN RECV %06x SEND %06x, PSN %06x, SRQN %06x, GID %32s\n" +#define TERMINATION_FORMAT "%s" +#define TERMINATION_MSG_SIZE 4 +#define TERMINATION_MSG "END" +static int page_size; + +struct pingpong_dest { + union ibv_gid gid; + int lid; + int recv_qpn; + int send_qpn; + int recv_psn; + int send_psn; + int srqn; + int pp_cnt; + int sockfd; +}; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_xrcd *xrcd; + struct ibv_qp **recv_qp; + struct ibv_qp **send_qp; + struct pingpong_dest *rem_dest; + void *buf; + int lid; + int sl; + enum ibv_mtu mtu; + int ib_port; + int fd; + int size; + int num_clients; + int num_tests; + int use_event; + int gidx; +}; + +struct pingpong_context ctx; + + +static int open_device(char *ib_devname) +{ + struct ibv_device **dev_list; + int i = 0; + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + fprintf(stderr, "Failed to get IB devices list"); + return -1; + } + + if (ib_devname) { + for (; dev_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + } + } + if (!dev_list[i]) { + fprintf(stderr, "IB device %s not found\n", + ib_devname ? 
ib_devname : ""); + return -1; + } + + ctx.context = ibv_open_device(dev_list[i]); + if (!ctx.context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(dev_list[i])); + return -1; + } + + ibv_free_device_list(dev_list); + return 0; +} + +static int create_qps(void) +{ + struct ibv_qp_init_attr_ex init; + struct ibv_qp_attr mod; + int i; + + for (i = 0; i < ctx.num_clients; ++i) { + + memset(&init, 0, sizeof init); + init.qp_type = IBV_QPT_XRC_RECV; + init.comp_mask = IBV_QP_INIT_ATTR_XRCD; + init.xrcd = ctx.xrcd; + + ctx.recv_qp[i] = ibv_create_qp_ex(ctx.context, &init); + if (!ctx.recv_qp[i]) { + fprintf(stderr, "Couldn't create recv QP[%d] errno %d\n", + i, errno); + return 1; + } + + mod.qp_state = IBV_QPS_INIT; + mod.pkey_index = 0; + mod.port_num = ctx.ib_port; + mod.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; + + if (ibv_modify_qp(ctx.recv_qp[i], &mod, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify recv QP[%d] to INIT\n", i); + return 1; + } + + memset(&init, 0, sizeof init); + init.qp_type = IBV_QPT_XRC_SEND; + init.send_cq = ctx.send_cq; + init.cap.max_send_wr = ctx.num_clients * ctx.num_tests; + init.cap.max_send_sge = 1; + init.comp_mask = IBV_QP_INIT_ATTR_PD; + init.pd = ctx.pd; + + ctx.send_qp[i] = ibv_create_qp_ex(ctx.context, &init); + if (!ctx.send_qp[i]) { + fprintf(stderr, "Couldn't create send QP[%d] errno %d\n", + i, errno); + return 1; + } + + mod.qp_state = IBV_QPS_INIT; + mod.pkey_index = 0; + mod.port_num = ctx.ib_port; + mod.qp_access_flags = 0; + + if (ibv_modify_qp(ctx.send_qp[i], &mod, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify send QP[%d] to INIT\n", i); + return 1; + } + } + + return 0; +} + +static int pp_init_ctx(char *ib_devname) +{ + struct ibv_srq_init_attr_ex attr; + struct ibv_xrcd_init_attr xrcd_attr; + struct ibv_port_attr port_attr; + + 
ctx.recv_qp = calloc(ctx.num_clients, sizeof *ctx.recv_qp); + ctx.send_qp = calloc(ctx.num_clients, sizeof *ctx.send_qp); + ctx.rem_dest = calloc(ctx.num_clients, sizeof *ctx.rem_dest); + if (!ctx.recv_qp || !ctx.send_qp || !ctx.rem_dest) + return 1; + + if (open_device(ib_devname)) { + fprintf(stderr, "Failed to open device\n"); + return 1; + } + + if (pp_get_port_info(ctx.context, ctx.ib_port, &port_attr)) { + fprintf(stderr, "Failed to get SLID\n"); + return 1; + } + + ctx.lid = port_attr.lid; + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + ctx.buf = memalign(page_size, ctx.size); + if (!ctx.buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + return 1; + } + + memset(ctx.buf, 0, ctx.size); + + if (ctx.use_event) { + ctx.channel = ibv_create_comp_channel(ctx.context); + if (!ctx.channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + return 1; + } + } + + ctx.pd = ibv_alloc_pd(ctx.context); + if (!ctx.pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + return 1; + } + + ctx.mr = ibv_reg_mr(ctx.pd, ctx.buf, ctx.size, IBV_ACCESS_LOCAL_WRITE); + if (!ctx.mr) { + fprintf(stderr, "Couldn't register MR\n"); + return 1; + } + + ctx.fd = open("/tmp/xrc_domain", O_RDONLY | O_CREAT, S_IRUSR | S_IRGRP); + if (ctx.fd < 0) { + fprintf(stderr, + "Couldn't create the file for the XRC Domain " + "but not stopping %d\n", errno); + ctx.fd = -1; + } + + memset(&xrcd_attr, 0, sizeof xrcd_attr); + xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; + xrcd_attr.fd = ctx.fd; + xrcd_attr.oflags = O_CREAT; + ctx.xrcd = ibv_open_xrcd(ctx.context, &xrcd_attr); + if (!ctx.xrcd) { + fprintf(stderr, "Couldn't Open the XRC Domain %d\n", errno); + return 1; + } + + ctx.recv_cq = ibv_create_cq(ctx.context, ctx.num_clients, &ctx.recv_cq, + ctx.channel, 0); + if (!ctx.recv_cq) { + fprintf(stderr, "Couldn't create recv CQ\n"); + return 1; + } + + if (ctx.use_event) 
{ + if (ibv_req_notify_cq(ctx.recv_cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + ctx.send_cq = ibv_create_cq(ctx.context, ctx.num_clients, NULL, NULL, 0); + if (!ctx.send_cq) { + fprintf(stderr, "Couldn't create send CQ\n"); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.attr.max_wr = ctx.num_clients; + attr.attr.max_sge = 1; + attr.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_XRCD | + IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD; + attr.srq_type = IBV_SRQT_XRC; + attr.xrcd = ctx.xrcd; + attr.cq = ctx.recv_cq; + attr.pd = ctx.pd; + + ctx.srq = ibv_create_srq_ex(ctx.context, &attr); + if (!ctx.srq) { + fprintf(stderr, "Couldn't create SRQ\n"); + return 1; + } + + if (create_qps()) + return 1; + + return 0; +} + +static int recv_termination_ack(int index) +{ + char msg[TERMINATION_MSG_SIZE]; + int n = 0, r; + int sockfd = ctx.rem_dest[index].sockfd; + + while (n < TERMINATION_MSG_SIZE) { + r = read(sockfd, msg + n, TERMINATION_MSG_SIZE - n); + if (r < 0) { + perror("client read"); + fprintf(stderr, + "%d/%d: Couldn't read remote termination ack\n", + n, TERMINATION_MSG_SIZE); + return 1; + } + n += r; + } + + if (strcmp(msg, TERMINATION_MSG)) { + fprintf(stderr, "Invalid termination ack was accepted\n"); + return 1; + } + + return 0; +} + +static int send_termination_ack(int index) +{ + char msg[TERMINATION_MSG_SIZE]; + int sockfd = ctx.rem_dest[index].sockfd; + + sprintf(msg, TERMINATION_FORMAT, TERMINATION_MSG); + + if (write(sockfd, msg, TERMINATION_MSG_SIZE) != TERMINATION_MSG_SIZE) { + fprintf(stderr, "Couldn't send termination ack\n"); + return 1; + } + + return 0; +} + +static int pp_client_termination() +{ + if (send_termination_ack(0)) + return 1; + if (recv_termination_ack(0)) + return 1; + + return 0; +} + +static int pp_server_termination() +{ + int i; + + for (i = 0; i < ctx.num_clients; i++) { + if (recv_termination_ack(i)) + return 1; + } + + for (i = 0; i < ctx.num_clients; i++) { + if 
(send_termination_ack(i)) + return 1; + } + + return 0; +} + +static int send_local_dest(int sockfd, int index) +{ + char msg[MSG_SIZE]; + char gid[33]; + uint32_t srq_num; + union ibv_gid local_gid; + ctx.rem_dest[index].recv_psn = lrand48() & 0xffffff; + + if (ctx.gidx >= 0) { + if (ibv_query_gid(ctx.context, ctx.ib_port, ctx.gidx, + &local_gid)) { + fprintf(stderr, "can't read sgid of index %d\n", + ctx.gidx); + return -1; + } + } else { + memset(&local_gid, 0, sizeof(local_gid)); + } + + if (ibv_get_srq_num(ctx.srq, &srq_num)) { + fprintf(stderr, "Couldn't get SRQ num\n"); + return -1; + } + + gid_to_wire_gid(&local_gid, gid); + printf(ADDR_FORMAT, "local", ctx.lid, ctx.recv_qp[index]->qp_num, + ctx.send_qp[index]->qp_num, ctx.rem_dest[index].recv_psn, + srq_num, gid); + + sprintf(msg, MSG_FORMAT, ctx.lid, ctx.recv_qp[index]->qp_num, + ctx.send_qp[index]->qp_num, ctx.rem_dest[index].recv_psn, + srq_num, gid); + + if (write(sockfd, msg, MSG_SIZE) != MSG_SIZE) { + fprintf(stderr, "Couldn't send local address\n"); + return -1; + } + + return 0; +} + +static int recv_remote_dest(int sockfd, int index) +{ + char msg[MSG_SIZE]; + char gid[33]; + struct pingpong_dest *rem_dest; + int n = 0, r; + + while (n < MSG_SIZE) { + r = read(sockfd, msg + n, MSG_SIZE - n); + if (r < 0) { + perror("client read"); + fprintf(stderr, + "%d/%d: Couldn't read remote address [%d]\n", + n, MSG_SIZE, index); + return -1; + } + n += r; + } + + rem_dest = &ctx.rem_dest[index]; + sscanf(msg, MSG_SSCAN, &rem_dest->lid, &rem_dest->recv_qpn, + &rem_dest->send_qpn, &rem_dest->send_psn, &rem_dest->srqn, gid); + + wire_gid_to_gid(gid, &rem_dest->gid); + printf(ADDR_FORMAT, "remote", rem_dest->lid, rem_dest->recv_qpn, + rem_dest->send_qpn, rem_dest->send_psn, rem_dest->srqn, gid); + + rem_dest->sockfd = sockfd; + return 0; +} + +static int connect_qps(int index) +{ + struct ibv_qp_attr attr; + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTR; + attr.dest_qp_num = 
ctx.rem_dest[index].send_qpn; + attr.path_mtu = ctx.mtu; + attr.rq_psn = ctx.rem_dest[index].send_psn; + attr.min_rnr_timer = 12; + attr.ah_attr.dlid = ctx.rem_dest[index].lid; + attr.ah_attr.sl = ctx.sl; + attr.ah_attr.port_num = ctx.ib_port; + + if (ctx.rem_dest[index].gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 5; + attr.ah_attr.grh.dgid = ctx.rem_dest[index].gid; + attr.ah_attr.grh.sgid_index = ctx.gidx; + } + + if (ibv_modify_qp(ctx.recv_qp[index], &attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify recv QP[%d] to RTR\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.sq_psn = ctx.rem_dest[index].recv_psn; + + if (ibv_modify_qp(ctx.recv_qp[index], &attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_SQ_PSN)) { + fprintf(stderr, "Failed to modify recv QP[%d] to RTS\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTR; + attr.dest_qp_num = ctx.rem_dest[index].recv_qpn; + attr.path_mtu = ctx.mtu; + attr.rq_psn = ctx.rem_dest[index].send_psn; + attr.ah_attr.dlid = ctx.rem_dest[index].lid; + attr.ah_attr.sl = ctx.sl; + attr.ah_attr.port_num = ctx.ib_port; + + if (ctx.rem_dest[index].gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 5; + attr.ah_attr.grh.dgid = ctx.rem_dest[index].gid; + attr.ah_attr.grh.sgid_index = ctx.gidx; + } + + if (ibv_modify_qp(ctx.send_qp[index], &attr, + IBV_QP_STATE | IBV_QP_AV | + IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN)) { + fprintf(stderr, "Failed to modify send QP[%d] to RTR\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = ctx.rem_dest[index].recv_psn; + + if 
/*
 * Client side of the out-of-band exchange: connect a TCP socket to
 * servername:port, send the local address, read the server's address and
 * move QP pair 0 to a connected state.
 *
 * Returns 0 on success, 1 on any failure.
 */
static int pp_client_connect(const char *servername, int port)
{
	struct addrinfo *res, *t;
	char *service;
	int ret;
	int sockfd = -1;
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};

	if (asprintf(&service, "%d", port) < 0)
		return 1;

	/*
	 * getaddrinfo() reports failure with a non-zero code whose sign is
	 * platform-specific, so test for non-zero rather than negative.
	 */
	ret = getaddrinfo(servername, service, &hints, &res);
	if (ret) {
		fprintf(stderr, "%s for %s:%d\n", gai_strerror(ret), servername, port);
		free(service);
		return 1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}

	freeaddrinfo(res);
	free(service);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
		return 1;
	}

	/* Until the exchange succeeds nothing else owns the socket. */
	if (send_local_dest(sockfd, 0) || recv_remote_dest(sockfd, 0)) {
		close(sockfd);
		return 1;
	}

	if (connect_qps(0))
		return 1;

	return 0;
}

/*
 * Server side of the out-of-band exchange: listen on port, accept
 * ctx.num_clients connections and, for each one, read the client's address,
 * send the local address and connect the matching QP pair.
 *
 * Returns 0 on success, 1 on any failure.  The listening socket is closed
 * on every path.
 */
static int pp_server_connect(int port)
{
	struct addrinfo *res, *t;
	char *service;
	int ret, i, n;
	int sockfd = -1, connfd;
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};

	if (asprintf(&service, "%d", port) < 0)
		return 1;

	/* Non-zero, not negative, is the portable failure test. */
	ret = getaddrinfo(NULL, service, &hints, &res);
	if (ret) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(ret), port);
		free(service);
		return 1;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			n = 1;
			setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
			if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}

	freeaddrinfo(res);
	free(service);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return 1;
	}

	if (listen(sockfd, ctx.num_clients)) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		goto err;
	}

	for (i = 0; i < ctx.num_clients; i++) {
		connfd = accept(sockfd, NULL, NULL);
		if (connfd < 0) {
			fprintf(stderr, "accept() failed for client %d\n", i);
			goto err;
		}

		if (recv_remote_dest(connfd, i)) {
			/* Not yet recorded in ctx.rem_dest, so close it here. */
			close(connfd);
			goto err;
		}

		if (send_local_dest(connfd, i))
			goto err;

		if (connect_qps(i))
			goto err;
	}

	close(sockfd);
	return 0;

err:
	close(sockfd);
	return 1;
}
channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx.context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx.buf); + free(ctx.rem_dest); + free(ctx.send_qp); + free(ctx.recv_qp); + return 0; +} + +static int pp_post_recv(int cnt) +{ + struct ibv_sge sge; + struct ibv_recv_wr wr, *bad_wr; + + sge.addr = (uintptr_t) ctx.buf; + sge.length = ctx.size; + sge.lkey = ctx.mr->lkey; + + wr.next = NULL; + wr.wr_id = (uintptr_t) &ctx; + wr.sg_list = &sge; + wr.num_sge = 1; + + while (cnt--) { + if (ibv_post_srq_recv(ctx.srq, &wr, &bad_wr)) { + fprintf(stderr, "Failed to post receive to SRQ\n"); + return 1; + } + } + return 0; +} + +/* + * Send to each client round robin on each set of xrc send/recv qp. + * Generate a completion on the last send. + */ +static int pp_post_send(int index) +{ + struct ibv_sge sge; + struct ibv_send_wr wr, *bad_wr; + int qpi; + + sge.addr = (uintptr_t) ctx.buf; + sge.length = ctx.size; + sge.lkey = ctx.mr->lkey; + + wr.wr_id = (uintptr_t) index; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.qp_type.xrc.remote_srqn = ctx.rem_dest[index].srqn; + + qpi = (index + ctx.rem_dest[index].pp_cnt) % ctx.num_clients; + wr.send_flags = (++ctx.rem_dest[index].pp_cnt >= ctx.num_tests) ? 
+ IBV_SEND_SIGNALED : 0; + + return ibv_post_send(ctx.send_qp[qpi], &wr, &bad_wr); +} + +static int find_qp(int qpn) +{ + int i; + + if (ctx.num_clients == 1) + return 0; + + for (i = 0; i < ctx.num_clients; ++i) + if (ctx.recv_qp[i]->qp_num == qpn) + return i; + + fprintf(stderr, "Unable to find qp %x\n", qpn); + return 0; +} + +static int get_cq_event(void) +{ + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx.channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + if (ev_cq != ctx.recv_cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(ctx.recv_cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + return 0; +} + +static void init(void) +{ + srand48(getpid() * time(NULL)); + + ctx.size = 4096; + ctx.ib_port = 1; + ctx.num_clients = 1; + ctx.num_tests = 5; + ctx.mtu = IBV_MTU_1024; + ctx.sl = 0; + ctx.gidx = -1; +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s connect to server at \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port= listen on/connect to port (default 18515)\n"); + printf(" -d, --ib-dev= use IB device (default first device found)\n"); + printf(" -i, --ib-port= use port of IB device (default 1)\n"); + printf(" -s, --size= size of message to exchange (default 4096)\n"); + printf(" -m, --mtu= path MTU (default 1024)\n"); + printf(" -c, --clients= number of clients (on server only, default 1)\n"); + printf(" -n, --num_tests= number of tests per client (default 5)\n"); + printf(" -l, --sl= service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -g, --gid-idx= local port gid index\n"); +} + +int main(int argc, char *argv[]) +{ + char *ib_devname = NULL; + char *servername = NULL; + int port = 18515; + int i, total, cnt = 0; + int ne, qpi, 
num_cq_events = 0; + struct ibv_wc wc; + + init(); + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "clients", .has_arg = 1, .val = 'c' }, + { .name = "num_tests", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { 0 } + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:c:n:l:eg:", long_options, + NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + case 'd': + ib_devname = strdupa(optarg); + break; + case 'i': + ctx.ib_port = strtol(optarg, NULL, 0); + if (ctx.ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + case 's': + ctx.size = strtol(optarg, NULL, 0); + break; + case 'm': + ctx.mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (ctx.mtu < 0) { + usage(argv[0]); + return 1; + } + break; + case 'c': + ctx.num_clients = strtol(optarg, NULL, 0); + break; + case 'n': + ctx.num_tests = strtol(optarg, NULL, 0); + break; + case 'l': + ctx.sl = strtol(optarg, NULL, 0); + break; + case 'g': + ctx.gidx = strtol(optarg, NULL, 0); + break; + case 'e': + ctx.use_event = 1; + break; + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) { + servername = strdupa(argv[optind]); + ctx.num_clients = 1; + } else if (optind < argc) { + usage(argv[0]); + return 1; + } + + page_size = sysconf(_SC_PAGESIZE); + + if (pp_init_ctx(ib_devname)) + return 1; + + if (pp_post_recv(ctx.num_clients)) { + fprintf(stderr, "Couldn't post receives\n"); + return 1; + } + + if (servername) { + if (pp_client_connect(servername, port)) + return 1; + } else { + 
if (pp_server_connect(port)) + return 1; + + for (i = 0; i < ctx.num_clients; i++) + pp_post_send(i); + } + + total = ctx.num_clients * ctx.num_tests; + while (cnt < total) { + if (ctx.use_event) { + if (get_cq_event()) + return 1; + + ++num_cq_events; + } + + do { + ne = ibv_poll_cq(ctx.recv_cq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "Error polling cq %d\n", ne); + return 1; + } else if (ne == 0) { + break; + } + + if (wc.status) { + fprintf(stderr, "Work completion error %d\n", wc.status); + return 1; + } + + pp_post_recv(ne); + qpi = find_qp(wc.qp_num); + if (ctx.rem_dest[qpi].pp_cnt < ctx.num_tests) + pp_post_send(qpi); + cnt += ne; + } while (ne > 0); + } + + for (cnt = 0; cnt < ctx.num_clients; cnt += ne) { + ne = ibv_poll_cq(ctx.send_cq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "Error polling cq %d\n", ne); + return 1; + } + } + + if (ctx.use_event) + ibv_ack_cq_events(ctx.recv_cq, num_cq_events); + + /* Process should get an ack from the daemon to close its resources to + * make sure latest daemon's response sent via its target QP destined + * to an XSRQ created by another client won't be lost. + * Failure to do so may cause the other client to wait for that sent + * message forever. See comment on pp_post_send. 
+ */ + if (servername) { + if (pp_client_termination()) + return 1; + } else if (pp_server_termination()) { + return 1; + } + + if (pp_close_ctx()) + return 1; + + printf("success\n"); + return 0; +} Index: contrib/ofed/libibverbs/include/infiniband/arch.h =================================================================== --- contrib/ofed/libibverbs/include/infiniband/arch.h +++ contrib/ofed/libibverbs/include/infiniband/arch.h @@ -34,17 +34,25 @@ #define INFINIBAND_ARCH_H #include -#include -#include +#include +#include -#if __BYTE_ORDER == __LITTLE_ENDIAN +#ifdef htonll +#undef htonll +#endif + +#ifdef ntohll +#undef ntohll +#endif + +#if BYTE_ORDER == LITTLE_ENDIAN static inline uint64_t htonll(uint64_t x) { return bswap_64(x); } static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif BYTE_ORDER == BIG_ENDIAN static inline uint64_t htonll(uint64_t x) { return x; } static inline uint64_t ntohll(uint64_t x) { return x; } #else -#error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN +#error BYTE_ORDER is neither LITTLE_ENDIAN nor BIG_ENDIAN #endif /* @@ -68,24 +76,25 @@ #define rmb() mb() #define wmb() asm volatile("" ::: "memory") #define wc_wmb() mb() +#define nc_wmb() wmb() #elif defined(__x86_64__) -/* - * Only use lfence for mb() and rmb() because we don't care about - * ordering against non-temporal stores (for now at least). 
/*
 * Report the WC_AUTO_EVICT_SIZE value chosen by the architecture section
 * above, or 0 when the architecture did not define one.
 *
 * The stray ';' that followed each function body has been removed: an empty
 * declaration at file scope is rejected by pedantic ISO C builds, and this
 * header is included by application code.
 */
#ifdef WC_AUTO_EVICT_SIZE
static inline int wc_auto_evict_size(void)
{
	return WC_AUTO_EVICT_SIZE;
}
#else
static inline int wc_auto_evict_size(void)
{
	return 0;
}
#endif
VERBS_XRCD_HANDLE = 1 << 0, + VERBS_XRCD_RESERVED = 1 << 1 +}; + +struct verbs_xrcd { + struct ibv_xrcd xrcd; + uint32_t comp_mask; + uint32_t handle; +}; + +enum verbs_srq_mask { + VERBS_SRQ_TYPE = 1 << 0, + VERBS_SRQ_XRCD = 1 << 1, + VERBS_SRQ_CQ = 1 << 2, + VERBS_SRQ_NUM = 1 << 3, + VERBS_SRQ_RESERVED = 1 << 4 +}; + +struct verbs_srq { + struct ibv_srq srq; + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct verbs_xrcd *xrcd; + struct ibv_cq *cq; + uint32_t srq_num; +}; + +enum verbs_qp_mask { + VERBS_QP_XRCD = 1 << 0, + VERBS_QP_RESERVED = 1 << 1 +}; + +struct verbs_mw { + struct ibv_mw mw; + uint32_t handle; + enum ibv_mw_type type; +}; + +struct verbs_qp { + struct ibv_qp qp; + uint32_t comp_mask; + struct verbs_xrcd *xrcd; +}; typedef struct ibv_device *(*ibv_driver_init_func)(const char *uverbs_sys_path, int abi_version); +typedef struct verbs_device *(*verbs_driver_init_func)(const char *uverbs_sys_path, + int abi_version); void ibv_register_driver(const char *name, ibv_driver_init_func init_func); +void verbs_register_driver(const char *name, verbs_driver_init_func init_func); int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, size_t cmd_size, struct ibv_get_context_resp *resp, size_t resp_size); @@ -75,6 +122,13 @@ struct ibv_alloc_pd *cmd, size_t cmd_size, struct ibv_alloc_pd_resp *resp, size_t resp_size); int ibv_cmd_dealloc_pd(struct ibv_pd *pd); +int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, + int vxrcd_size, + struct ibv_xrcd_init_attr *attr, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, + size_t resp_size); +int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd); #define IBV_CMD_REG_MR_HAS_RESP_PARAMS int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access, @@ -82,6 +136,12 @@ size_t cmd_size, struct ibv_reg_mr_resp *resp, size_t resp_size); int ibv_cmd_dereg_mr(struct ibv_mr *mr); +int ibv_cmd_alloc_mw(struct 
ibv_pd *pd, enum ibv_mw_type type, + struct verbs_mw *mw, struct ibv_alloc_mw *cmd, + size_t cmd_size, + struct ibv_alloc_mw_resp *resp, size_t resp_size); +int ibv_cmd_dealloc_mw(struct verbs_mw *mw, + struct ibv_dealloc_mw *cmd, size_t cmd_size); int ibv_cmd_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector, struct ibv_cq *cq, @@ -99,11 +159,11 @@ struct ibv_srq *srq, struct ibv_srq_init_attr *attr, struct ibv_create_srq *cmd, size_t cmd_size, struct ibv_create_srq_resp *resp, size_t resp_size); -int ibv_cmd_create_xrc_srq(struct ibv_pd *pd, - struct ibv_srq *srq, struct ibv_srq_init_attr *attr, - uint32_t xrc_domain, uint32_t xrc_cq, - struct ibv_create_xrc_srq *cmd, size_t cmd_size, - struct ibv_create_srq_resp *resp, size_t resp_size); +int ibv_cmd_create_srq_ex(struct ibv_context *context, + struct verbs_srq *srq, int vsrq_sz, + struct ibv_srq_init_attr_ex *attr_ex, + struct ibv_create_xsrq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size); int ibv_cmd_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, @@ -117,6 +177,11 @@ struct ibv_qp *qp, struct ibv_qp_init_attr *attr, struct ibv_create_qp *cmd, size_t cmd_size, struct ibv_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_open_qp(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_open_attr *attr, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *qp_attr, int attr_mask, struct ibv_qp_init_attr *qp_init_attr, @@ -137,24 +202,18 @@ int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +struct ibv_exp_flow *ibv_exp_cmd_create_flow(struct ibv_qp *qp, + struct ibv_exp_flow_attr *flow_attr); +int ibv_exp_cmd_destroy_flow(struct ibv_exp_flow *flow_id); 
+struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); +int ibv_cmd_destroy_flow(struct ibv_flow *flow_id); + int ibv_dontfork_range(void *base, size_t size); int ibv_dofork_range(void *base, size_t size); -int ibv_cmd_open_xrc_domain(struct ibv_context *context, int fd, int oflag, - struct ibv_xrc_domain *d, - struct ibv_open_xrc_domain_resp *resp, - size_t resp_size); -int ibv_cmd_close_xrc_domain(struct ibv_xrc_domain *d); -int ibv_cmd_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_rcv_qpn); -int ibv_cmd_modify_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_rcv_qpn, - struct ibv_qp_attr *attr, int attr_mask); -int ibv_cmd_query_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_rcv_qpn, - struct ibv_qp_attr *attr, int attr_mask, - struct ibv_qp_init_attr *init_attr); -int ibv_cmd_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); -int ibv_cmd_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); +void ibv_cmd_query_device_assign(struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device_resp *resp); /* * sysfs helper functions @@ -164,9 +223,14 @@ int ibv_read_sysfs_file(const char *dir, const char *file, char *buf, size_t size); -int ibv_resolve_eth_gid(const struct ibv_pd *pd, uint8_t port_num, - union ibv_gid *dgid, uint8_t sgid_index, - uint8_t mac[], uint16_t *vlan, uint8_t *tagged, - uint8_t *is_mcast); +static inline int verbs_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct verbs_srq *vsrq = container_of(srq, struct verbs_srq, srq); + if (vsrq->comp_mask & VERBS_SRQ_NUM) { + *srq_num = vsrq->srq_num; + return 0; + } + return ENOSYS; +} #endif /* INFINIBAND_DRIVER_H */ Index: contrib/ofed/libibverbs/include/infiniband/driver_exp.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/include/infiniband/driver_exp.h @@ -0,0 +1,151 @@ +/* + * Copyright 
(c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_DRIVER_EXP_H +#define INFINIBAND_DRIVER_EXP_H + +#include +#include +#include + +int ibv_exp_cmd_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_exp_query_device *cmd, size_t cmd_size); +int ibv_exp_cmd_create_qp(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_exp_qp_init_attr *attr_exp, + void *cmd_buf, size_t lib_cmd_size, size_t drv_cmd_size, + void *resp_buf, size_t lib_resp_size, size_t drv_resp_size, + int force_exp); +int ibv_exp_cmd_create_dct(struct ibv_context *context, + struct ibv_exp_dct *dct, + struct ibv_exp_dct_init_attr *attr, + struct ibv_exp_create_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_create_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz); +int ibv_exp_cmd_destroy_dct(struct ibv_context *context, + struct ibv_exp_dct *dct, + struct ibv_exp_destroy_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_destroy_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz); +int ibv_exp_cmd_query_dct(struct ibv_context *context, + struct ibv_exp_query_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_query_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz, + struct ibv_exp_dct_attr *attr); +int ibv_exp_cmd_arm_dct(struct ibv_context *context, + struct ibv_exp_arm_attr *attr, + struct ibv_exp_arm_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_arm_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz); +int ibv_exp_cmd_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, + int attr_mask, + struct ibv_exp_modify_cq *cmd, size_t cmd_size); +int ibv_exp_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_exp_create_cq *cmd, size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_create_cq_resp *resp, size_t lib_resp_sz, size_t drv_resp_sz, + 
struct ibv_exp_cq_init_attr *attr); +int ibv_exp_cmd_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t attr_mask, struct ibv_exp_modify_qp *cmd, + size_t cmd_size); +int ibv_exp_cmd_create_mr(struct ibv_exp_create_mr_in *in, struct ibv_mr *mr, + struct ibv_exp_create_mr *cmd, size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_create_mr_resp *resp, size_t lib_resp_sz, size_t drv_resp_sz); +int ibv_exp_cmd_query_mkey(struct ibv_context *context, + struct ibv_mr *mr, + struct ibv_exp_mkey_attr *mkey_attr, + struct ibv_exp_query_mkey *cmd, size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_query_mkey_resp *resp, size_t lib_resp_sz, size_t drv_resp_sz); +int ibv_cmd_exp_reg_mr(const struct ibv_exp_reg_mr_in *mr_init_attr, + uint64_t hca_va, struct ibv_mr *mr, + struct ibv_exp_reg_mr *cmd, + size_t cmd_size, + struct ibv_exp_reg_mr_resp *resp, + size_t resp_size); +int ibv_cmd_exp_prefetch_mr(struct ibv_mr *mr, + struct ibv_exp_prefetch_attr *attr); +int ibv_exp_cmd_create_wq(struct ibv_context *context, + struct ibv_exp_wq_init_attr *wq_init_attr, + struct ibv_exp_wq *wq, + struct ibv_exp_create_wq *cmd, + size_t cmd_core_size, + size_t cmd_size, + struct ibv_exp_create_wq_resp *resp, + size_t resp_core_size, + size_t resp_size); +int ibv_exp_cmd_destroy_wq(struct ibv_exp_wq *wq); +int ibv_exp_cmd_modify_wq(struct ibv_exp_wq *wq, struct ibv_exp_wq_attr *attr, + struct ib_exp_modify_wq *cmd, size_t cmd_size); +int ibv_exp_cmd_create_rwq_ind_table(struct ibv_context *context, + struct ibv_exp_rwq_ind_table_init_attr *init_attr, + struct ibv_exp_rwq_ind_table *rwq_ind_table, + struct ibv_exp_create_rwq_ind_table *cmd, + size_t cmd_core_size, + size_t cmd_size, + struct ibv_exp_create_rwq_ind_table_resp *resp, + size_t resp_core_size, + size_t resp_size); +int ibv_exp_cmd_destroy_rwq_ind_table(struct ibv_exp_rwq_ind_table *rwq_ind_table); +int ibv_exp_cmd_rereg_mr(struct ibv_mr *mr, uint32_t flags, void *addr, + size_t length, uint64_t hca_va, 
int access, + struct ibv_pd *pd, struct ibv_exp_rereg_mr_attr *attr, + struct ibv_exp_rereg_mr *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_rereg_mr_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz); +/* + * ibv_exp_cmd_getenv + * + * @context: context to the device + * @name: the name of the variable to read + * @value: pointer where the value of the variable will be written + * @n: number of bytes pointed to by value + * + * return: 0 success + * < 0 variable was not found + * > 0 variable found but not enough space provided. required space is the value returned. + */ +int ibv_exp_cmd_getenv(struct ibv_context *context, + const char *name, char *value, size_t n); + + +#endif /* INFINIBAND_DRIVER_EXP_H */ Index: contrib/ofed/libibverbs/include/infiniband/kern-abi.h =================================================================== --- contrib/ofed/libibverbs/include/infiniband/kern-abi.h +++ contrib/ofed/libibverbs/include/infiniband/kern-abi.h @@ -45,9 +45,11 @@ /* * The minimum and maximum kernel ABI that we can handle. 
*/ -#define IB_USER_VERBS_MIN_ABI_VERSION 1 +#define IB_USER_VERBS_MIN_ABI_VERSION 3 #define IB_USER_VERBS_MAX_ABI_VERSION 6 +#define IB_USER_VERBS_CMD_THRESHOLD 50 + enum { IB_USER_VERBS_CMD_GET_CONTEXT, IB_USER_VERBS_CMD_QUERY_DEVICE, @@ -86,14 +88,26 @@ IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, IB_USER_VERBS_CMD_POST_SRQ_RECV, - IB_USER_VERBS_CMD_CREATE_XRC_SRQ, - IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN, - IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN, - IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP, - IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP, - IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP, - IB_USER_VERBS_CMD_REG_XRC_RCV_QP, - IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP, +}; + + +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u +#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 + + +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80ul + + +enum { + IB_USER_VERBS_CMD_CREATE_FLOW = (IB_USER_VERBS_CMD_FLAG_EXTENDED << + IB_USER_VERBS_CMD_FLAGS_SHIFT) + + IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_CMD_DESTROY_FLOW }; /* @@ -107,10 +121,44 @@ * different between 32-bit and 64-bit architectures. */ +struct hdr { + __u32 command; + __u16 in_words; + __u16 out_words; +}; + +struct response_hdr { + __u64 response; +}; + +struct ex_hdr { + struct { + __u32 command; + __u16 in_words; + __u16 out_words; + }; + struct { + __u64 response; + }; + struct { + __u16 provider_in_words; + __u16 provider_out_words; + __u32 reserved; + }; +}; + +enum ibv_event_rsc_type { + IBV_EVENT_RSC_CQ, + IBV_EVENT_RSC_QP, + IBV_EVENT_RSC_DCT, + IBV_EVENT_RSC_SRQ, + IBV_EVENT_RSC_DEVICE, +}; + struct ibv_kern_async_event { __u64 element; __u32 event_type; - __u32 reserved; + __u32 rsc_type; }; struct ibv_comp_event { @@ -125,6 +173,13 @@ * the rest of the command struct based on these value. 
*/ +#define IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, field) \ + ((ibv_type *)((void *)(ex_ptr) + offsetof(ex_type, \ + field) + sizeof((ex_ptr)->field))) + +#define IBV_RESP_TO_VERBS_RESP_EX(ex_ptr, ex_type, ibv_type) \ + IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, comp_mask) + struct ibv_query_params { __u32 command; __u16 in_words; @@ -254,6 +309,27 @@ __u32 pd_handle; }; +struct ibv_open_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ibv_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ibv_close_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 xrcd_handle; +}; + struct ibv_reg_mr { __u32 command; __u16 in_words; @@ -280,6 +356,28 @@ __u32 mr_handle; }; +struct ibv_alloc_mw { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 pd_handle; + __u8 mw_type; + __u8 reserved[3]; +}; + +struct ibv_alloc_mw_resp { + __u32 mw_handle; + __u32 rkey; +}; + +struct ibv_dealloc_mw { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 mw_handle; +}; + struct ibv_create_comp_channel { __u32 command; __u16 in_words; @@ -460,6 +558,20 @@ __u64 driver_data[0]; }; +struct ibv_open_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ struct ibv_create_qp_resp { __u32 qp_handle; __u32 qpn; @@ -471,6 +583,49 @@ __u32 reserved; }; +enum ibv_create_qp_ex_comp_mask { + IBV_CREATE_QP_EX_CAP_FLAGS = (1ULL << 0) +}; + +struct ibv_create_qp_ex { + __u32 command; + __u16 in_words; + __u16 out_words; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; + __u64 comp_mask; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 
max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 qp_cap_flags; + __u64 driver_data[0]; +}; + +struct ibv_create_qp_resp_ex { + __u64 comp_mask; + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + struct ibv_qp_dest { __u8 dgid[16]; __u32 flow_label; @@ -576,93 +731,6 @@ __u32 events_reported; }; -struct ibv_create_xrc_rcv_qp { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u64 user_handle; - __u32 xrc_domain_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ibv_create_xrc_rcv_qp_resp { - __u32 qpn; - __u32 reserved; -}; - -struct ibv_modify_xrc_rcv_qp { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 xrc_domain_handle; - __u32 qp_num; - struct ibv_qp_dest dest; - struct ibv_qp_dest alt_dest; - __u32 attr_mask; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 en_sqd_async_notify; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ibv_query_xrc_rcv_qp { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u32 xrc_domain_handle; - __u32 qp_num; - __u32 attr_mask; - __u32 reserved; - __u64 driver_data[0]; -}; - -struct ibv_reg_xrc_rcv_qp { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 xrc_domain_handle; - __u32 qp_num; - __u64 
driver_data[0]; -}; - -struct ibv_unreg_xrc_rcv_qp { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 xrc_domain_handle; - __u32 qp_num; - __u64 driver_data[0]; -}; - struct ibv_kern_send_wr { __u64 wr_id; __u32 num_sge; @@ -689,6 +757,95 @@ __u32 reserved; } ud; } wr; + union { + struct { + __u32 remote_srqn; + } xrc; + } qp_type; +}; + +struct ibv_kern_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __u16 ether_type; + __u16 vlan_tag; +}; + +struct ibv_kern_spec_eth { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_kern_eth_filter val; + struct ibv_kern_eth_filter mask; +}; + +struct ibv_kern_ib_filter { + __u32 qpn; + __u8 dst_gid[16]; +}; + +struct ibv_kern_spec_ib { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_kern_ib_filter val; + struct ibv_kern_ib_filter mask; +}; + +struct ibv_kern_ipv4_filter { + __u32 src_ip; + __u32 dst_ip; +}; + +struct ibv_kern_spec_ipv4 { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_kern_ipv4_filter val; + struct ibv_kern_ipv4_filter mask; +}; + +struct ibv_kern_tcp_udp_filter { + __u16 dst_port; + __u16 src_port; +}; + +struct ibv_kern_spec_tcp_udp { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_kern_tcp_udp_filter val; + struct ibv_kern_tcp_udp_filter mask; +}; + + +struct ibv_kern_spec { + union { + struct { + __u32 type; + __u16 size; + __u16 reserved; + } hdr; + struct ibv_kern_spec_ib ib; + struct ibv_kern_spec_eth eth; + struct ibv_kern_spec_ipv4 ipv4; + struct ibv_kern_spec_tcp_udp tcp_udp; + }; + +}; + +struct ibv_kern_flow_attr { + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; + /* Following are the optional layers according to user request + * struct ibv_kern_flow_spec_xxx + * struct ibv_kern_flow_spec_yyy + */ }; struct ibv_post_send { @@ -789,6 +946,24 @@ __u64 driver_data[0]; }; +struct ibv_create_flow { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 qp_handle; + struct 
ibv_kern_flow_attr flow_attr; +}; + +struct ibv_create_flow_resp { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ibv_destroy_flow { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 flow_handle; +}; + struct ibv_create_srq { __u32 command; __u16 in_words; @@ -802,18 +977,20 @@ __u64 driver_data[0]; }; -struct ibv_create_xrc_srq { +struct ibv_create_xsrq { __u32 command; __u16 in_words; __u16 out_words; __u64 response; __u64 user_handle; + __u32 srq_type; __u32 pd_handle; __u32 max_wr; __u32 max_sge; __u32 srq_limit; + __u32 reserved; __u32 xrcd_handle; - __u32 xrc_cq; + __u32 cq_handle; __u64 driver_data[0]; }; @@ -821,7 +998,7 @@ __u32 srq_handle; __u32 max_wr; __u32 max_sge; - __u32 reserved; + __u32 srqn; }; struct ibv_modify_srq { @@ -865,30 +1042,6 @@ __u32 events_reported; }; -struct ibv_open_xrc_domain { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u32 fd; - __u32 oflags; - __u64 driver_data[0]; -}; - -struct ibv_open_xrc_domain_resp { - __u32 xrcd_handle; -}; - -struct ibv_close_xrc_domain { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u32 xrcd_handle; - __u32 reserved; - __u64 driver_data[0]; -}; - /* * Compatibility with older ABI versions */ @@ -938,55 +1091,22 @@ * trick opcodes in IBV_INIT_CMD() doesn't break. 
*/ IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL_V2 = -1, - IB_USER_VERBS_CMD_CREATE_XRC_SRQ_V2 = -1, - IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN_V2 = -1, - IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN_V2 = -1, - IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP_V2 = -1, - IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP_V2 = -1, - IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP_V2 = -1, - IB_USER_VERBS_CMD_REG_XRC_RCV_QP_V2 = -1, - IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP_V2 = -1, -}; - -struct ibv_destroy_cq_v1 { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 cq_handle; -}; - -struct ibv_destroy_qp_v1 { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 qp_handle; -}; - -struct ibv_destroy_srq_v1 { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 srq_handle; -}; - -struct ibv_get_context_v2 { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u64 cq_fd_tab; - __u64 driver_data[0]; -}; - -struct ibv_create_cq_v2 { - __u32 command; - __u16 in_words; - __u16 out_words; - __u64 response; - __u64 user_handle; - __u32 cqe; - __u32 event_handler; - __u64 driver_data[0]; + IB_USER_VERBS_CMD_CREATE_QP_EX_V2 = -1, + IB_USER_VERBS_CMD_MODIFY_CQ_EX_V2 = -1, + IB_USER_VERBS_CMD_CREATE_FLOW_V2 = -1, + IB_USER_VERBS_CMD_DESTROY_FLOW_V2 = -1, + IB_USER_VERBS_CMD_OPEN_XRCD_V2 = -1, + IB_USER_VERBS_CMD_CLOSE_XRCD_V2 = -1, + IB_USER_VERBS_CMD_CREATE_XSRQ_V2 = -1, + IB_USER_VERBS_CMD_OPEN_QP_V2 = -1, + IB_USER_VERBS_CMD_MODIFY_QP_EX_V2 = -1, + IB_USER_VERBS_CMD_CREATE_CQ_EX_V2 = -1, + IB_USER_VERBS_CMD_QUERY_DEVICE_EX_V2 = -1, + IB_USER_VERBS_CMD_CREATE_DCT_V2 = -1, + IB_USER_VERBS_CMD_DESTROY_DCT_V2 = -1, + IB_USER_VERBS_CMD_QUERY_DCT_V2 = -1, + IB_USER_VERBS_CMD_EXP_REG_MR_V2 = -1, + IB_USER_VERBS_CMD_EXP_PREFETCH_MR_V2 = -1, }; struct ibv_modify_srq_v3 { Index: contrib/ofed/libibverbs/include/infiniband/kern-abi_exp.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/include/infiniband/kern-abi_exp.h @@ -0,0 +1,623 @@ +/* + * 
Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef KERN_ABI_EXP_H +#define KERN_ABI_EXP_H + +#include + +/* + * This file must be kept in sync with the kernel's version of + * drivers/infiniband/include/ib_user_verbs_exp.h + */ + +enum { + IB_USER_VERBS_EXP_CMD_FIRST = 64 +}; + +enum { + IB_USER_VERBS_EXP_CMD_CREATE_QP, + IB_USER_VERBS_EXP_CMD_MODIFY_CQ, + IB_USER_VERBS_EXP_CMD_MODIFY_QP, + IB_USER_VERBS_EXP_CMD_CREATE_CQ, + IB_USER_VERBS_EXP_CMD_QUERY_DEVICE, + IB_USER_VERBS_EXP_CMD_CREATE_DCT, + IB_USER_VERBS_EXP_CMD_DESTROY_DCT, + IB_USER_VERBS_EXP_CMD_QUERY_DCT, + IB_USER_VERBS_EXP_CMD_ARM_DCT, + IB_USER_VERBS_EXP_CMD_CREATE_MR, + IB_USER_VERBS_EXP_CMD_QUERY_MKEY, + IB_USER_VERBS_EXP_CMD_REG_MR, + IB_USER_VERBS_EXP_CMD_PREFETCH_MR, + IB_USER_VERBS_EXP_CMD_REREG_MR, + IB_USER_VERBS_EXP_CMD_CREATE_WQ, + IB_USER_VERBS_EXP_CMD_MODIFY_WQ, + IB_USER_VERBS_EXP_CMD_DESTROY_WQ, + IB_USER_VERBS_EXP_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EXP_CMD_DESTROY_RWQ_IND_TBL, + IB_USER_VERBS_EXP_CMD_CREATE_FLOW, +}; + +enum { + IB_USER_VERBS_CMD_EXP_CREATE_WQ = + IB_USER_VERBS_EXP_CMD_CREATE_WQ + + IB_USER_VERBS_EXP_CMD_FIRST, + IB_USER_VERBS_CMD_EXP_MODIFY_WQ = + IB_USER_VERBS_EXP_CMD_MODIFY_WQ + + IB_USER_VERBS_EXP_CMD_FIRST, + IB_USER_VERBS_CMD_EXP_DESTROY_WQ = + IB_USER_VERBS_EXP_CMD_DESTROY_WQ + + IB_USER_VERBS_EXP_CMD_FIRST, + IB_USER_VERBS_CMD_EXP_CREATE_RWQ_IND_TBL = + IB_USER_VERBS_EXP_CMD_CREATE_RWQ_IND_TBL + + IB_USER_VERBS_EXP_CMD_FIRST, + IB_USER_VERBS_CMD_EXP_DESTROY_RWQ_IND_TBL = + IB_USER_VERBS_EXP_CMD_DESTROY_RWQ_IND_TBL + + IB_USER_VERBS_EXP_CMD_FIRST, + /* + * Set commands that didn't exist to -1 so our compile-time + * trick opcodes in IBV_INIT_CMD() doesn't break. 
+ */ + IB_USER_VERBS_CMD_EXP_CREATE_WQ_V2 = -1, + IB_USER_VERBS_CMD_EXP_MODIFY_WQ_V2 = -1, + IB_USER_VERBS_CMD_EXP_DESTROY_WQ_V2 = -1, + IB_USER_VERBS_CMD_EXP_CREATE_RWQ_IND_TBL_V2 = -1, + IB_USER_VERBS_CMD_EXP_DESTROY_RWQ_IND_TBL_V2 = -1, +}; + +enum ibv_exp_create_qp_comp_mask { + IBV_EXP_CREATE_QP_CAP_FLAGS = (1ULL << 0), + IBV_EXP_CREATE_QP_INL_RECV = (1ULL << 1), + IBV_EXP_CREATE_QP_QPG = (1ULL << 2), + IBV_EXP_CREATE_QP_MAX_INL_KLMS = (1ULL << 3) +}; + +struct ibv_create_qpg_init_attrib { + __u32 tss_child_count; + __u32 rss_child_count; +}; + +struct ibv_create_qpg { + __u32 qpg_type; + union { + struct { + __u32 parent_handle; + __u32 reserved; + }; + struct ibv_create_qpg_init_attrib parent_attrib; + }; + __u32 reserved2; +}; + +enum ibv_exp_create_qp_kernel_flags { + IBV_EXP_CREATE_QP_KERNEL_FLAGS = IBV_EXP_QP_CREATE_CROSS_CHANNEL | + IBV_EXP_QP_CREATE_MANAGED_SEND | + IBV_EXP_QP_CREATE_MANAGED_RECV | + IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY +}; + +struct ibv_exp_create_qp { + struct ex_hdr hdr; + __u64 comp_mask; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 qp_cap_flags; + __u32 max_inl_recv; + __u32 reserved1; + struct ibv_create_qpg qpg; + __u64 max_inl_send_klms; + struct { + __u64 rx_hash_fields_mask; + __u32 rwq_ind_tbl_handle; + __u8 rx_hash_function; + __u8 rx_hash_key_len; + __u8 rx_hash_key[128]; + __u16 reserved; + } rx_hash_info; + __u8 port_num; + __u8 reserved_2[7]; + __u64 driver_data[0]; +}; + +enum ibv_exp_create_qp_resp_comp_mask { + IBV_EXP_CREATE_QP_RESP_INL_RECV = (1ULL << 0), +}; + +struct ibv_exp_create_qp_resp { + __u64 comp_mask; + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 max_inl_recv; 
+}; + +struct ibv_exp_umr_caps_resp { + __u32 max_klm_list_size; + __u32 max_send_wqe_inline_klms; + __u32 max_umr_recursion_depth; + __u32 max_umr_stride_dimension; +}; + +struct ibv_exp_odp_caps_resp { + __u64 comp_mask; + __u64 general_odp_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + __u32 dc_odp_caps; + __u32 xrc_odp_caps; + __u32 raw_eth_odp_caps; + } per_transport_caps; +}; + +struct ibv_exp_query_device { + struct ex_hdr hdr; + __u64 comp_mask; + __u64 driver_data[0]; +}; + +struct ibv_exp_rx_hash_caps_resp { + __u32 max_rwq_indirection_tables; + __u32 max_rwq_indirection_table_size; + __u64 supported_packet_fields; + __u32 supported_qps; + __u8 supported_hash_functions; + __u8 reserved[3]; +}; + +struct ibv_exp_mp_rq_caps_resp { + __u32 supported_qps; /* use ibv_exp_supported_qp_types */ + __u32 allowed_shifts; /* use ibv_exp_mp_rq_shifts */ + __u8 min_single_wqe_log_num_of_strides; + __u8 max_single_wqe_log_num_of_strides; + __u8 min_single_stride_log_num_of_bytes; + __u8 max_single_stride_log_num_of_bytes; + __u32 reserved; +}; + +struct ibv_exp_query_device_resp { + __u64 comp_mask; + __u64 fw_ver; + __u64 node_guid; + __u64 sys_image_guid; + __u64 max_mr_size; + __u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 exp_atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + 
__u8 reserved[4]; + __u64 timestamp_mask; + __u64 hca_core_clock; + __u64 device_cap_flags2; + __u32 dc_rd_req; + __u32 dc_rd_res; + __u32 inline_recv_sz; + __u32 max_rss_tbl_sz; + __u64 log_atomic_arg_sizes; + __u32 max_fa_bit_boundary; + __u32 log_max_atomic_inline; + struct ibv_exp_umr_caps_resp umr_caps; + struct ibv_exp_odp_caps_resp odp_caps; + __u32 max_dct; + __u32 max_ctx_res_domain; + struct ibv_exp_rx_hash_caps_resp rx_hash; + __u32 max_wq_type_rq; + __u32 max_device_ctx; + struct ibv_exp_mp_rq_caps_resp mp_rq_caps; +}; + +struct ibv_exp_create_dct { + struct ex_hdr hdr; + __u64 comp_mask; + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 srq_handle; + __u32 access_flags; + __u64 dc_key; + __u32 flow_label; + __u8 min_rnr_timer; + __u8 tclass; + __u8 port; + __u8 pkey_index; + __u8 gid_index; + __u8 hop_limit; + __u8 mtu; + __u8 rsvd0; + __u32 create_flags; + __u32 inline_size; + __u32 rsvd1; + __u64 driver_data[0]; +}; + +struct ibv_exp_create_dct_resp { + __u32 dct_handle; + __u32 dct_num; + __u32 inline_size; + __u32 rsvd; +}; + +struct ibv_exp_destroy_dct { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 dct_handle; + __u32 rsvd; + __u64 driver_data[0]; +}; + +struct ibv_exp_destroy_dct_resp { + __u32 events_reported; + __u32 reserved; +}; + +struct ibv_exp_query_dct { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 dct_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_exp_query_dct_resp { + __u64 dc_key; + __u32 access_flags; + __u32 flow_label; + __u32 key_violations; + __u8 port; + __u8 min_rnr_timer; + __u8 tclass; + __u8 mtu; + __u8 pkey_index; + __u8 gid_index; + __u8 hop_limit; + __u8 state; + __u32 rsvd; + __u64 driver_data[0]; +}; + +struct ibv_exp_arm_dct { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 dct_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_exp_arm_dct_resp { + __u64 driver_data[0]; +}; + +struct ibv_exp_modify_cq { + struct ex_hdr hdr; + __u32 cq_handle; + __u32 
attr_mask; + __u16 cq_count; + __u16 cq_period; + __u32 cq_cap_flags; + __u32 comp_mask; + __u32 rsvd; +}; + +struct ibv_exp_modify_qp { + struct ex_hdr hdr; + __u32 comp_mask; + struct ibv_qp_dest dest; + struct ibv_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[6]; + __u64 dct_key; + __u32 exp_attr_mask; + __u32 flow_entropy; + __u64 driver_data[0]; +}; + +enum ibv_exp_create_cq_comp_mask { + IBV_EXP_CREATE_CQ_CAP_FLAGS = (uint64_t)1 << 0, +}; + +struct ibv_exp_create_cq { + struct ex_hdr hdr; + __u64 comp_mask; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 create_flags; + __u64 driver_data[0]; +}; + +struct ibv_exp_create_mr { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 pd_handle; + __u32 max_klm_list_size; + __u64 exp_access_flags; + __u32 create_flags; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_exp_create_mr_resp { + __u64 comp_mask; + __u32 handle; + __u32 lkey; + __u32 rkey; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_exp_query_mkey { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 handle; + __u32 lkey; + __u32 rkey; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_exp_query_mkey_resp { + __u64 comp_mask; + __u32 max_klm_list_size; + __u32 reserved; + __u64 driver_data[0]; +}; + +enum ibv_exp_reg_mr_comp_mask { + IBV_EXP_REG_MR_EXP_ACCESS_FLAGS = 1ULL << 0, +}; + +struct ibv_exp_reg_mr { + struct ex_hdr hdr; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 reserved; + __u64 exp_access_flags; 
+ __u64 comp_mask; +}; + +struct ibv_exp_prefetch_mr { + struct ex_hdr hdr; + __u64 comp_mask; + __u32 mr_handle; + __u32 flags; + __u64 start; + __u64 length; +}; + +struct ibv_exp_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; + __u32 reserved; + __u64 comp_mask; +}; + +struct ibv_exp_rereg_mr { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 mr_handle; + __u32 flags; + __u32 reserved; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct ibv_exp_rereg_mr_resp { + __u32 comp_mask; + __u32 lkey; + __u32 rkey; + __u32 reserved; +}; + +struct ibv_exp_cmd_wq_mp_rq { + __u32 use_shift; /* use ibv_exp_mp_rq_shifts */ + __u8 single_wqe_log_num_of_strides; + __u8 single_stride_log_num_of_bytes; + __u16 reserved; +}; + +enum ibv_exp_cmd_create_wq_comp_mask { + IBV_EXP_CMD_CREATE_WQ_MP_RQ = 1 << 0, +}; + +struct ibv_exp_create_wq { + struct ex_hdr hdr; + __u32 comp_mask; /* enum ibv_exp_cmd_create_wq_comp_mask */ + __u32 wq_type; /* enum ibv_exp_wq_type */ + __u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 srq_handle; + __u32 max_recv_wr; + __u32 max_recv_sge; + __u32 reserved; + struct ibv_exp_cmd_wq_mp_rq mp_rq; +}; + +struct ibv_exp_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_recv_wr; + __u32 max_recv_sge; + __u32 wqn; +}; + +struct ib_exp_destroy_wq { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_exp_modify_wq { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; +}; + +struct ibv_exp_create_rwq_ind_table { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 pd_handle; + __u32 log_ind_tbl_size; + __u32 reserved; + /* Following are wq handles based on log_ind_tbl_size, must be 64 bytes aligned. 
+ * __u32 wq_handle1 + * __u32 wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ibv_exp_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ibv_exp_destroy_rwq_ind_table { + struct ex_hdr hdr; + __u32 comp_mask; + __u32 ind_tbl_handle; +}; + +struct ibv_exp_kern_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; +}; + +struct ibv_exp_kern_spec_ipv6 { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_exp_kern_ipv6_filter val; + struct ibv_exp_kern_ipv6_filter mask; +}; + + +struct ibv_exp_kern_spec { + union { + struct { + __u32 type; + __u16 size; + __u16 reserved; + } hdr; + struct ibv_kern_spec_ib ib; + struct ibv_kern_spec_eth eth; + struct ibv_kern_spec_ipv4 ipv4; + struct ibv_kern_spec_tcp_udp tcp_udp; + struct ibv_exp_kern_spec_ipv6 ipv6; + }; +}; +#endif /* KERN_ABI_EXP_H */ Index: contrib/ofed/libibverbs/include/infiniband/ofa_verbs.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/include/infiniband/ofa_verbs.h @@ -0,0 +1,210 @@ +#ifndef INFINIBAND_OFA_VERBS_H +#define INFINIBAND_OFA_VERBS_H + +struct ibv_srq_init_attr; +struct ibv_cq; +struct ibv_pd; +struct ibv_qp_init_attr; +struct ibv_qp_attr; + + +#ifdef __GNUC__ +#define DEPRECATED __attribute__((deprecated)) +#else +#define DEPRECATED +#endif + +/* XRC compatibility layer */ +#define LEGACY_XRC_SRQ_HANDLE 0xffffffff + +struct ibv_xrc_domain { + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_srq_legacy { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + uint32_t events_completed; + + uint32_t xrc_srq_num_bin_compat; + struct ibv_xrc_domain *xrc_domain_bin_compat; + struct ibv_cq *xrc_cq_bin_compat; + + pthread_mutex_t mutex; + pthread_cond_t cond; + /* Here we hook the new one from OFED 2.0 */ + void *ibv_srq; + /* Below 3 fields are for legacy source compatibility, reside + * on same 
offset as of those fields in struct ibv_srq. + */ + uint32_t xrc_srq_num; + struct ibv_xrc_domain *xrc_domain; + struct ibv_cq *xrc_cq; +}; + +/** + * ibv_open_xrc_domain - open an XRC domain + * Returns a reference to an XRC domain. + * + * @context: Device context + * @fd: descriptor for inode associated with the domain + * If fd == -1, no inode is associated with the domain; in this case, + * the only legal value for oflag is O_CREAT + * + * @oflag: oflag values are constructed by OR-ing flags from the following list + * + * O_CREAT + * If a domain belonging to device named by context is already associated + * with the inode, this flag has no effect, except as noted under O_EXCL + * below. Otherwise, a new XRC domain is created and is associated with + * inode specified by fd. + * + * O_EXCL + * If O_EXCL and O_CREAT are set, open will fail if a domain associated with + * the inode exists. The check for the existence of the domain and creation + * of the domain if it does not exist is atomic with respect to other + * processes executing open with fd naming the same inode. + */ +struct ibv_xrc_domain *ibv_open_xrc_domain(struct ibv_context *context, + int fd, int oflag) DEPRECATED; + +/** + * ibv_create_xrc_srq - Creates a SRQ associated with the specified protection + * domain and xrc domain. + * @pd: The protection domain associated with the SRQ. + * @xrc_domain: The XRC domain associated with the SRQ. + * @xrc_cq: CQ to report completions for XRC packets on. + * + * @srq_init_attr: A list of initial attributes required to create the SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read to determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ibv_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values.
+ */ +struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd *pd, + struct ibv_xrc_domain *xrc_domain, + struct ibv_cq *xrc_cq, + struct ibv_srq_init_attr *srq_init_attr) DEPRECATED; + +/** + * ibv_close_xrc_domain - close an XRC domain + * If this is the last reference, destroys the domain. + * + * @d: reference to XRC domain to close + * + * close is implicitly performed at process exit. + */ +int ibv_close_xrc_domain(struct ibv_xrc_domain *d) DEPRECATED; + +/** + * ibv_create_xrc_rcv_qp - creates an XRC QP for serving as a receive-side-only QP, + * + * This QP is created in kernel space, and persists until the last process + * registered for the QP calls ibv_unreg_xrc_rcv_qp() (at which time the QP + * is destroyed). + * + * @init_attr: init attributes to use for QP. xrc domain MUST be included here. + * All other fields are ignored. + * + * @xrc_rcv_qpn: qp_num of created QP (if success). To be passed to the + * remote node (sender). The remote node will use xrc_rcv_qpn + * in ibv_post_send when sending to XRC SRQ's on this host + * in the same xrc domain. + * + * RETURNS: success (0), or a (negative) error value. + * + * NOTE: this verb also registers the calling user-process with the QP at its + * creation time (implicit call to ibv_reg_xrc_rcv_qp), to avoid race + * conditions. The creating process will need to call ibv_unreg_xrc_rcv_qp() + * for the QP to release it from this process. + */ +int ibv_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, + uint32_t *xrc_rcv_qpn) DEPRECATED; + +/** + * ibv_modify_xrc_rcv_qp - modifies an xrc_rcv qp. + * + * @xrc_domain: xrc domain the QP belongs to (for verification). + * @xrc_qp_num: The (24 bit) number of the XRC QP. + * @attr: modify-qp attributes.
The following fields must be specified: + * for RESET_2_INIT: qp_state, pkey_index, port, qp_access_flags + * for INIT_2_RTR: qp_state, path_mtu, dest_qp_num, rq_psn, + * max_dest_rd_atomic, min_rnr_timer, ah_attr + * The QP need not be brought to RTS for the QP to operate as a + * receive-only QP. + * @attr_mask: bitmap indicating which attributes are provided in the attr + * struct. Used for validity checking. + * The following bits must be set: + * for RESET_2_INIT: IBV_QP_PKEY_INDEX, IBV_QP_PORT, + * IBV_QP_ACCESS_FLAGS, IBV_QP_STATE + * for INIT_2_RTR: IBV_QP_AV, IBV_QP_PATH_MTU, IBV_QP_DEST_QPN, + * IBV_QP_RQ_PSN, IBV_QP_MAX_DEST_RD_ATOMIC, + * IBV_QP_MIN_RNR_TIMER, IBV_QP_STATE + * + * RETURNS: success (0), or a (positive) error value. + * + */ +int ibv_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, + uint32_t xrc_qp_num, + struct ibv_qp_attr *attr, int attr_mask) DEPRECATED; + +/** + * ibv_query_xrc_rcv_qp - queries an xrc_rcv qp. + * + * @xrc_domain: xrc domain the QP belongs to (for verification). + * @xrc_qp_num: The (24 bit) number of the XRC QP. + * @attr: for returning qp attributes. + * @attr_mask: bitmap indicating which attributes to return. + * @init_attr: for returning the init attributes + * + * RETURNS: success (0), or a (positive) error value. + * + */ +int ibv_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num, + struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr) DEPRECATED; + +/** + * ibv_reg_xrc_rcv_qp: registers a user process with an XRC QP which serves as + * a receive-side only QP. + * + * @xrc_domain: xrc domain the QP belongs to (for verification). + * @xrc_qp_num: The (24 bit) number of the XRC QP. + * + * RETURNS: success (0), + * or error (EINVAL), if: + * 1. There is no such QP_num allocated. + * 2. The QP is allocated, but is not a receive XRC QP + * 3. The XRC QP does not belong to the given domain.
+ */ +int ibv_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, + uint32_t xrc_qp_num) DEPRECATED; + +/** + * ibv_unreg_xrc_rcv_qp: detaches a user process from an XRC QP serving as + * a receive-side only QP. If as a result, there are no remaining + * userspace processes registered for this XRC QP, it is destroyed. + * + * @xrc_domain: xrc domain the QP belongs to (for verification). + * @xrc_qp_num: The (24 bit) number of the XRC QP. + * + * RETURNS: success (0), + * or error (EINVAL), if: + * 1. There is no such QP_num allocated. + * 2. The QP is allocated, but is not an XRC QP + * 3. The XRC QP does not belong to the given domain. + * NOTE: There is no reason to return a special code if the QP is destroyed. + * The unregister simply succeeds. + */ +int ibv_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, + uint32_t xrc_qp_num) DEPRECATED; + + +#endif + + Index: contrib/ofed/libibverbs/include/infiniband/verbs.h =================================================================== --- contrib/ofed/libibverbs/include/infiniband/verbs.h +++ contrib/ofed/libibverbs/include/infiniband/verbs.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * @@ -38,6 +38,9 @@ #include #include +#include +#include +#include #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { @@ -63,18 +66,46 @@ } global; }; +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#define container_of(ptr, type, member) \ + ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) +#endif + +#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) + +static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; + enum ibv_node_type { IBV_NODE_UNKNOWN = -1, IBV_NODE_CA = 1, IBV_NODE_SWITCH, IBV_NODE_ROUTER, - IBV_NODE_RNIC + IBV_NODE_RNIC, + + /* Leave a gap for future node types before starting with + * experimental node types. + */ + IBV_EXP_NODE_TYPE_START = 32, + IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START }; enum ibv_transport_type { IBV_TRANSPORT_UNKNOWN = -1, IBV_TRANSPORT_IB = 0, - IBV_TRANSPORT_IWARP + IBV_TRANSPORT_IWARP, + + /* Leave a gap for future transport types before starting with + * experimental transport types. + */ + IBV_EXP_TRANSPORT_TYPE_START = 32, + IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START }; enum ibv_device_cap_flags { @@ -93,7 +124,8 @@ IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, IBV_DEVICE_SRQ_RESIZE = 1 << 13, IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, - IBV_DEVICE_XRC = 1 << 20 + IBV_DEVICE_XRC = 1 << 20, + IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 }; enum ibv_atomic_cap { @@ -166,6 +198,34 @@ IBV_LINK_LAYER_UNSPECIFIED, IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET, + + /* Leave a gap for future link layer types before starting with + * experimental link layer. 
+ */ + IBV_EXP_LINK_LAYER_START = 32, + IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START +}; + +enum ibv_port_cap_flags { + IBV_PORT_SM = 1 << 1, + IBV_PORT_NOTICE_SUP = 1 << 2, + IBV_PORT_TRAP_SUP = 1 << 3, + IBV_PORT_OPT_IPD_SUP = 1 << 4, + IBV_PORT_AUTO_MIGR_SUP = 1 << 5, + IBV_PORT_SL_MAP_SUP = 1 << 6, + IBV_PORT_MKEY_NVRAM = 1 << 7, + IBV_PORT_PKEY_NVRAM = 1 << 8, + IBV_PORT_LED_INFO_SUP = 1 << 9, + IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IBV_PORT_CM_SUP = 1 << 16, + IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IBV_PORT_REINIT_SUP = 1 << 18, + IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, + IBV_PORT_VENDOR_CLASS = 1 << 24, + IBV_PORT_CLIENT_REG_SUP = 1 << 25, + IBV_PORT_IP_BASED_GIDS = 1 << 26, }; struct ibv_port_attr { @@ -189,7 +249,7 @@ uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; - uint8_t pad; + uint8_t reserved; }; enum ibv_event_type { @@ -212,10 +272,13 @@ IBV_EVENT_QP_LAST_WQE_REACHED, IBV_EVENT_CLIENT_REREGISTER, IBV_EVENT_GID_CHANGE, -}; -enum ibv_event_flags { - IBV_XRC_QP_EVENT_FLAG = 0x80000000, + /* new experimental events start here leaving enough + * room for 14 events which should be enough + */ + IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, + IBV_EXP_EVENT_DCT_ACCESS_ERR, + IBV_EXP_EVENT_DCT_REQ_ERR, }; struct ibv_async_event { @@ -223,7 +286,9 @@ struct ibv_cq *cq; struct ibv_qp *qp; struct ibv_srq *srq; + struct ibv_exp_dct *dct; int port_num; + /* For source compatible with Legacy API */ uint32_t xrc_qp_num; } element; enum ibv_event_type event_type; @@ -304,6 +369,22 @@ uint32_t handle; }; +enum ibv_xrcd_init_attr_mask { + IBV_XRCD_INIT_ATTR_FD = 1 << 0, + IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, + IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_xrcd_init_attr { + uint32_t comp_mask; + int fd; + int oflags; +}; + +struct ibv_xrcd { + struct ibv_context *context; +}; + enum ibv_rereg_mr_flags { IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), 
IBV_REREG_MR_CHANGE_PD = (1 << 1), @@ -359,7 +440,15 @@ IBV_RATE_40_GBPS = 7, IBV_RATE_60_GBPS = 8, IBV_RATE_80_GBPS = 9, - IBV_RATE_120_GBPS = 10 + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 }; /** @@ -376,6 +465,19 @@ */ enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. + */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + struct ibv_ah_attr { struct ibv_global_route grh; uint16_t dlid; @@ -386,11 +488,6 @@ uint8_t port_num; }; -struct ibv_xrc_domain { - struct ibv_context *context; - uint32_t handle; -}; - enum ibv_srq_attr_mask { IBV_SRQ_MAX_WR = 1 << 0, IBV_SRQ_LIMIT = 1 << 1 @@ -407,12 +504,46 @@ struct ibv_srq_attr attr; }; +enum ibv_srq_type { + IBV_SRQT_BASIC, + IBV_SRQT_XRC +}; + +enum ibv_srq_init_attr_mask { + IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, + IBV_SRQ_INIT_ATTR_PD = 1 << 1, + IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, + IBV_SRQ_INIT_ATTR_CQ = 1 << 3, + IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_srq_init_attr_ex { + void *srq_context; + struct ibv_srq_attr attr; + + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct ibv_cq *cq; +}; + enum ibv_qp_type { IBV_QPT_RC = 2, IBV_QPT_UC, IBV_QPT_UD, + /* XRC compatible code */ IBV_QPT_XRC, - IBV_QPT_RAW_ETH = 8 + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_RAW_ETH = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV, + + /* Leave a gap for future qp types before starting with + * experimental qp types. 
+ */ + IBV_EXP_QP_TYPE_START = 32, + IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START }; struct ibv_qp_cap { @@ -431,9 +562,46 @@ struct ibv_qp_cap cap; enum ibv_qp_type qp_type; int sq_sig_all; + /* Below is needed for backward compatibility */ struct ibv_xrc_domain *xrc_domain; }; +enum ibv_qp_init_attr_mask { + IBV_QP_INIT_ATTR_PD = 1 << 0, + IBV_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_QP_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_qp_init_attr_ex { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; +}; + +enum ibv_qp_open_attr_mask { + IBV_QP_OPEN_ATTR_NUM = 1 << 0, + IBV_QP_OPEN_ATTR_XRCD = 1 << 1, + IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, + IBV_QP_OPEN_ATTR_TYPE = 1 << 3, + IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_qp_open_attr { + uint32_t comp_mask; + uint32_t qp_num; + struct ibv_xrcd *xrcd; + void *qp_context; + enum ibv_qp_type qp_type; +}; + enum ibv_qp_attr_mask { IBV_QP_STATE = 1 << 0, IBV_QP_CUR_STATE = 1 << 1, @@ -465,7 +633,8 @@ IBV_QPS_RTS, IBV_QPS_SQD, IBV_QPS_SQE, - IBV_QPS_ERR + IBV_QPS_ERR, + IBV_QPS_UNKNOWN }; enum ibv_mig_state { @@ -550,7 +719,15 @@ uint32_t remote_qkey; } ud; } wr; - uint32_t xrc_remote_srq_num; + union { + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + + uint32_t xrc_remote_srq_num; + }; }; struct ibv_recv_wr { @@ -575,16 +752,31 @@ struct ibv_pd *pd; uint32_t handle; + pthread_mutex_t mutex; + pthread_cond_t cond; uint32_t events_completed; + /* below are for source compatibility with legacy XRC, + * padding based on ibv_srq_legacy.
+ */ + uint32_t xrc_srq_num_bin_compat_padding; + struct ibv_xrc_domain *xrc_domain_bin_compat_padding; + struct ibv_cq *xrc_cq_bin_compat_padding; + void *ibv_srq_padding; + + /* legacy fields */ uint32_t xrc_srq_num; - struct ibv_xrc_domain *xrc_domain; - struct ibv_cq *xrc_cq; + struct ibv_xrc_domain *xrc_domain; + struct ibv_cq *xrc_cq; +}; - pthread_mutex_t mutex; - pthread_cond_t cond; +/* Not in use in new API, needed for compilation as part of source compat layer */ +enum ibv_event_flags { + IBV_XRC_QP_EVENT_FLAG = 0x80000000, }; + + struct ibv_qp { struct ibv_context *context; void *qp_context; @@ -597,12 +789,9 @@ enum ibv_qp_state state; enum ibv_qp_type qp_type; - uint32_t events_completed; - - struct ibv_xrc_domain *xrc_domain; - pthread_mutex_t mutex; pthread_cond_t cond; + uint32_t events_completed; }; struct ibv_comp_channel { @@ -618,11 +807,10 @@ uint32_t handle; int cqe; - uint32_t comp_events_completed; - uint32_t async_events_completed; - pthread_mutex_t mutex; pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; }; struct ibv_ah { @@ -631,6 +819,103 @@ uint32_t handle; }; +enum ibv_flow_flags { + IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, +}; + +enum ibv_flow_attr_type { + /* steering according to rule specifications */ + IBV_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_MC_DEFAULT = 0x2, +}; + +enum ibv_flow_spec_type { + IBV_FLOW_SPEC_ETH = 0x20, + IBV_FLOW_SPEC_IPV4 = 0x30, + IBV_FLOW_SPEC_TCP = 0x40, + IBV_FLOW_SPEC_UDP = 0x41, +}; + +struct ibv_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_flow_spec_eth { + enum ibv_flow_spec_type 
type; + uint16_t size; + struct ibv_flow_eth_filter val; + struct ibv_flow_eth_filter mask; +}; + +struct ibv_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_flow_spec_ipv4 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_filter val; + struct ibv_flow_ipv4_filter mask; +}; + +struct ibv_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_flow_spec_tcp_udp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tcp_udp_filter val; + struct ibv_flow_tcp_udp_filter mask; +}; + +struct ibv_flow_spec { + union { + struct { + enum ibv_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_ipv4 ipv4; + struct ibv_flow_spec_tcp_udp tcp_udp; + }; +}; + +struct ibv_flow_attr { + uint32_t comp_mask; + enum ibv_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx [L2] + * struct ibv_flow_spec_yyy [L3/L4] + */ +}; + +struct ibv_flow { + uint32_t comp_mask; + struct ibv_context *context; + uint32_t handle; +}; + struct ibv_device; struct ibv_context; @@ -658,30 +943,15 @@ char ibdev_path[IBV_SYSFS_PATH_MAX]; }; -struct ibv_more_ops { - struct ibv_srq * (*create_xrc_srq)(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *srq_init_attr); - struct ibv_xrc_domain * (*open_xrc_domain)(struct ibv_context *context, - int fd, int oflag); - int (*close_xrc_domain)(struct ibv_xrc_domain *d); - int (*create_xrc_rcv_qp)(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_qp_num); - int (*modify_xrc_rcv_qp)(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask); - int (*query_xrc_rcv_qp)(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask, - 
struct ibv_qp_init_attr *init_attr); - int (*reg_xrc_rcv_qp)(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); - int (*unreg_xrc_rcv_qp)(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); - +struct verbs_device { + struct ibv_device device; /* Must be first */ + size_t sz; + size_t size_of_context; + int (*init_context)(struct verbs_device *device, + struct ibv_context *ctx, int cmd_fd); + void (*uninit_context)(struct verbs_device *device, + struct ibv_context *ctx); + /* future fields added here */ }; struct ibv_context_ops { @@ -750,17 +1020,59 @@ int num_comp_vectors; pthread_mutex_t mutex; void *abi_compat; - struct ibv_more_ops *more_ops; }; -static inline int ___ibv_query_port(struct ibv_context *context, - uint8_t port_num, - struct ibv_port_attr *port_attr) +enum verbs_context_mask { + VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, + VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, + VERBS_CONTEXT_QP = (uint64_t)1 << 2, + VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, + VERBS_CONTEXT_EXP = (uint64_t)1 << 62 +}; + +struct verbs_context { + /* "grows up" - new fields go here */ + int (*_reserved_2) (void); + int (*destroy_flow) (struct ibv_flow *flow); + int (*_reserved_1) (void); + struct ibv_flow * (*create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + struct ibv_qp * (*open_qp)(struct ibv_context *context, + struct ibv_qp_open_attr *attr); + struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t has_comp_mask; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context;/* Must be last field in the struct */ +}; + +static 
inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { - port_attr->link_layer = IBV_LINK_LAYER_UNSPECIFIED; - port_attr->pad = 0; + return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? + NULL : container_of(ctx, struct verbs_context, context); +} + +#define verbs_get_ctx_op(ctx, op) ({ \ + struct verbs_context *_vctx = verbs_get_ctx(ctx); \ + (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ + !_vctx->op) ? NULL : _vctx; }) + +#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ + struct verbs_context *vctx = _vctx; \ + if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ + vctx->op = ptr; }) - return context->ops.query_port(context, port_num, port_attr); +static inline struct verbs_device *verbs_get_device( + const struct ibv_device *dev) +{ + return (dev->ops.alloc_context) ? + NULL : container_of(dev, struct verbs_device, device); } /** @@ -837,6 +1149,20 @@ int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); +static inline int ___ibv_query_port(struct ibv_context *context, + uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + /* For compatibility when running with old libibverbs */ + port_attr->link_layer = IBV_LINK_LAYER_UNSPECIFIED; + port_attr->reserved = 0; + + return ibv_query_port(context, port_num, port_attr); +} + +#define ibv_query_port(context, port_num, port_attr) \ + ___ibv_query_port(context, port_num, port_attr) + /** * ibv_query_gid - Get a GID table entry */ @@ -859,6 +1185,49 @@ */ int ibv_dealloc_pd(struct ibv_pd *pd); +static inline struct ibv_flow *ibv_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow) +{ + struct verbs_context *vctx = verbs_get_ctx_op(qp->context, + create_flow); + if (!vctx) + return NULL; + + return vctx->create_flow(qp, flow); +} + +static inline int ibv_destroy_flow(struct ibv_flow *flow_id) +{ + struct verbs_context *vctx = verbs_get_ctx_op(flow_id->context, + destroy_flow); + 
if (!vctx) + return -ENOSYS; + return vctx->destroy_flow(flow_id); +} + +/** + * ibv_open_xrcd - Open an extended connection domain + */ +static inline struct ibv_xrcd * +ibv_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, open_xrcd); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + return vctx->open_xrcd(context, xrcd_init_attr); +} + +/** + * ibv_close_xrcd - Close an extended connection domain + */ +static inline int ibv_close_xrcd(struct ibv_xrcd *xrcd) +{ + struct verbs_context *vctx = verbs_get_ctx(xrcd->context); + return vctx->close_xrcd(xrcd); +} + /** * ibv_reg_mr - Register a memory region */ @@ -871,6 +1240,43 @@ int ibv_dereg_mr(struct ibv_mr *mr); /** + * ibv_alloc_mw - Allocate a memory window + */ +static inline struct ibv_mw *ibv_alloc_mw(struct ibv_pd *pd, + enum ibv_mw_type type) +{ + if (!pd->context->ops.alloc_mw) { + errno = ENOSYS; + return NULL; + } + + struct ibv_mw *mw = pd->context->ops.alloc_mw(pd, type); + if (mw) { + mw->context = pd->context; + mw->pd = pd; + } + return mw; +} + +/** + * ibv_dealloc_mw - Free a memory window + */ +static inline int ibv_dealloc_mw(struct ibv_mw *mw) +{ + return mw->context->ops.dealloc_mw(mw); +} + +/** + * ibv_inc_rkey - increase the 8 lsb in the given rkey + */ +static inline uint32_t ibv_inc_rkey(uint32_t rkey) +{ + const uint32_t mask = 0x000000ff; + uint8_t newtag = (uint8_t) ((rkey + 1) & mask); + return (rkey & ~mask) | newtag; +} + +/** * ibv_create_comp_channel - Create a completion event channel */ struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context); @@ -982,24 +1388,27 @@ struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr); -/** - * ibv_create_xrc_srq - Creates a SRQ associated with the specified protection - * domain and xrc domain. - * @pd: The protection domain associated with the SRQ. 
- * @xrc_domain: The XRC domain associated with the SRQ. - * @xrc_cq: CQ to report completions for XRC packets on. - * - * @srq_init_attr: A list of initial attributes required to create the SRQ. - * - * srq_attr->max_wr and srq_attr->max_sge are read the determine the - * requested size of the SRQ, and set to the actual values allocated - * on return. If ibv_create_srq() succeeds, then max_wr and max_sge - * will always be at least as large as the requested values. - */ -struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *srq_init_attr); +static inline struct ibv_srq * +ibv_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex) +{ + struct verbs_context *vctx; + uint32_t mask = srq_init_attr_ex->comp_mask; + + if (!(mask & ~(IBV_SRQ_INIT_ATTR_PD | IBV_SRQ_INIT_ATTR_TYPE)) && + (mask & IBV_SRQ_INIT_ATTR_PD) && + (!(mask & IBV_SRQ_INIT_ATTR_TYPE) || + (srq_init_attr_ex->srq_type == IBV_SRQT_BASIC))) + return ibv_create_srq(srq_init_attr_ex->pd, + (struct ibv_srq_init_attr *) srq_init_attr_ex); + + vctx = verbs_get_ctx_op(context, create_srq_ex); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + return vctx->create_srq_ex(context, srq_init_attr_ex); +} /** * ibv_modify_srq - Modifies the attributes for the specified SRQ. @@ -1025,6 +1434,16 @@ */ int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); +static inline int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct verbs_context *vctx = verbs_get_ctx_op(srq->context, get_srq_num); + + if (!vctx) + return ENOSYS; + + return vctx->get_srq_num(srq, srq_num); +} + /** * ibv_destroy_srq - Destroys the specified SRQ. * @srq: The SRQ to destroy. 
@@ -1051,6 +1470,38 @@ struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); +static inline struct ibv_qp * +ibv_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex) +{ + struct verbs_context *vctx; + uint32_t mask = qp_init_attr_ex->comp_mask; + + if (mask == IBV_QP_INIT_ATTR_PD) + return ibv_create_qp(qp_init_attr_ex->pd, + (struct ibv_qp_init_attr *) qp_init_attr_ex); + + vctx = verbs_get_ctx_op(context, create_qp_ex); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + return vctx->create_qp_ex(context, qp_init_attr_ex); +} + +/** + * ibv_open_qp - Open a shareable queue pair. + */ +static inline struct ibv_qp * +ibv_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *qp_open_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, open_qp); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + return vctx->open_qp(context, qp_open_attr); +} + /** * ibv_modify_qp - Modify a queue pair. */ @@ -1182,149 +1633,10 @@ */ const char *ibv_event_type_str(enum ibv_event_type event); -/** - * ibv_open_xrc_domain - open an XRC domain - * Returns a reference to an XRC domain. - * - * @context: Device context - * @fd: descriptor for inode associated with the domain - * If fd == -1, no inode is associated with the domain; in this ca= se, - * the only legal value for oflag is O_CREAT - * - * @oflag: oflag values are constructed by OR-ing flags from the following list - * - * O_CREAT - * If a domain belonging to device named by context is already associated - * with the inode, this flag has no effect, except as noted under O_EXCL - * below. Otherwise, a new XRC domain is created and is associated with - * inode specified by fd. - * - * O_EXCL - * If O_EXCL and O_CREAT are set, open will fail if a domain associated with - * the inode exists. 
The check for the existence of the domain and creation - * of the domain if it does not exist is atomic with respect to other - * processes executing open with fd naming the same inode. - */ -struct ibv_xrc_domain *ibv_open_xrc_domain(struct ibv_context *context, - int fd, int oflag); - -/** - * ibv_close_xrc_domain - close an XRC domain - * If this is the last reference, destroys the domain. - * - * @d: reference to XRC domain to close - * - * close is implicitly performed at process exit. - */ -int ibv_close_xrc_domain(struct ibv_xrc_domain *d); - -/** - * ibv_create_xrc_rcv_qp - creates an XRC QP for serving as a receive-side-only QP, - * - * This QP is created in kernel space, and persists until the last process - * registered for the QP calls ibv_unreg_xrc_rcv_qp() (at which time the QP - * is destroyed). - * - * @init_attr: init attributes to use for QP. xrc domain MUST be included here. - * All other fields are ignored. - * - * @xrc_rcv_qpn: qp_num of created QP (if success). To be passed to the - * remote node (sender). The remote node will use xrc_rcv_qpn - * in ibv_post_send when sending to XRC SRQ's on this host - * in the same xrc domain. - * - * RETURNS: success (0), or a (negative) error value. - * - * NOTE: this verb also registers the calling user-process with the QP at its - * creation time (implicit call to ibv_reg_xrc_rcv_qp), to avoid race - * conditions. The creating process will need to call ibv_unreg_xrc_qp() - * for the QP to release it from this process. - */ -int ibv_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_rcv_qpn); - -/** - * ibv_modify_xrc_rcv_qp - modifies an xrc_rcv qp. - * - * @xrc_domain: xrc domain the QP belongs to (for verification). - * @xrc_qp_num: The (24 bit) number of the XRC QP. - * @attr: modify-qp attributes. 
The following fields must be specified: - * for RESET_2_INIT: qp_state, pkey_index , port, qp_access_flags - * for INIT_2_RTR: qp_state, path_mtu, dest_qp_num, rq_psn, - * max_dest_rd_atomic, min_rnr_timer, ah_attr - * The QP need not be brought to RTS for the QP to operate as a - * receive-only QP. - * @attr_mask: bitmap indicating which attributes are provided in the attr - * struct. Used for validity checking. - * The following bits must be set: - * for RESET_2_INIT: IBV_QP_PKEY_INDEX, IBV_QP_PORT, - * IBV_QP_ACCESS_FLAGS, IBV_QP_STATE - * for INIT_2_RTR: IBV_QP_AV, IBV_QP_PATH_MTU, IBV_QP_DEST_QPN, - * IBV_QP_RQ_PSN, IBV_QP_MAX_DEST_RD_ATOMIC, - * IBV_QP_MIN_RNR_TIMER, IBV_QP_STATE - * - * RETURNS: success (0), or a (positive) error value. - * - */ -int ibv_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask); - -/** - * ibv_query_xrc_rcv_qp - queries an xrc_rcv qp. - * - * @xrc_domain: xrc domain the QP belongs to (for verification). - * @xrc_qp_num: The (24 bit) number of the XRC QP. - * @attr: for returning qp attributes. - * @attr_mask: bitmap indicating which attributes to return. - * @init_attr: for returning the init attributes - * - * RETURNS: success (0), or a (positive) error value. - * - */ -int ibv_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask, - struct ibv_qp_init_attr *init_attr); - -/** - * ibv_reg_xrc_rcv_qp: registers a user process with an XRC QP which serves as - * a receive-side only QP. - * - * @xrc_domain: xrc domain the QP belongs to (for verification). - * @xrc_qp_num: The (24 bit) number of the XRC QP. - * - * RETURNS: success (0), - * or error (EINVAL), if: - * 1. There is no such QP_num allocated. - * 2. The QP is allocated, but is not an receive XRC QP - * 3. The XRC QP does not belong to the given domain. 
- */ -int ibv_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num); - -/** - * ibv_unreg_xrc_rcv_qp: detaches a user process from an XRC QP serving as - * a receive-side only QP. If as a result, there are no remaining - * userspace processes registered for this XRC QP, it is destroyed. - * - * @xrc_domain: xrc domain the QP belongs to (for verification). - * @xrc_qp_num: The (24 bit) number of the XRC QP. - * - * RETURNS: success (0), - * or error (EINVAL), if: - * 1. There is no such QP_num allocated. - * 2. The QP is allocated, but is not an XRC QP - * 3. The XRC QP does not belong to the given domain. - * NOTE: There is no reason to return a special code if the QP is destroyed. - * The unregister simply succeeds. - */ -int ibv_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); - END_C_DECLS # undef __attribute_const -#define ibv_query_port(context, port_num, port_attr) \ - ___ibv_query_port(context, port_num, port_attr) +#include #endif /* INFINIBAND_VERBS_H */ Index: contrib/ofed/libibverbs/include/infiniband/verbs_exp.h =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/include/infiniband/verbs_exp.h @@ -0,0 +1,2820 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_VERBS_EXP_H +#define INFINIBAND_VERBS_EXP_H + +#include +#include +#include + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +BEGIN_C_DECLS + +#define IBV_EXP_RET_ON_INVALID_COMP_MASK(val, valid_mask, ret) \ + if ((val) > (valid_mask)) { \ + fprintf(stderr, "%s: invalid comp_mask !!! 
(comp_mask = 0x%x valid_mask = 0x%x)\n", \ + __FUNCTION__, val, valid_mask); \ + errno = EINVAL; \ + return ret; \ + } + +#define IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(val, valid_mask) \ + IBV_EXP_RET_ON_INVALID_COMP_MASK(val, valid_mask, NULL) + +#define IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(val, valid_mask) \ + IBV_EXP_RET_ON_INVALID_COMP_MASK(val, valid_mask, EINVAL) + + +#define IBV_EXP_IMPLICIT_MR_SIZE (~((size_t)0)) + +enum ibv_exp_func_name { + IBV_EXP_POST_SEND_FUNC, + IBV_EXP_POLL_CQ_FUNC, + IBV_POST_SEND_FUNC, + IBV_POLL_CQ_FUNC, + IBV_POST_RECV_FUNC +}; + +enum ibv_exp_start_values { + IBV_EXP_START_ENUM = 0x40, + IBV_EXP_START_FLAG_LOC = 0x20, + IBV_EXP_START_FLAG = (1ULL << IBV_EXP_START_FLAG_LOC), +}; + +/* + * Capabilities for exp_atomic_cap field in ibv_exp_device_attr struct + */ +enum ibv_exp_atomic_cap { + IBV_EXP_ATOMIC_NONE = IBV_ATOMIC_NONE, + IBV_EXP_ATOMIC_HCA = IBV_ATOMIC_HCA, + IBV_EXP_ATOMIC_GLOB = IBV_ATOMIC_GLOB, + + IBV_EXP_ATOMIC_HCA_REPLY_BE = IBV_EXP_START_ENUM /* HOST is LE and atomic reply is BE */ +}; + +/* + * Flags for exp_device_cap_flags field in ibv_exp_device_attr struct + */ +enum ibv_exp_device_cap_flags { + IBV_EXP_DEVICE_RESIZE_MAX_WR = IBV_DEVICE_RESIZE_MAX_WR, + IBV_EXP_DEVICE_BAD_PKEY_CNTR = IBV_DEVICE_BAD_PKEY_CNTR, + IBV_EXP_DEVICE_BAD_QKEY_CNTR = IBV_DEVICE_BAD_QKEY_CNTR, + IBV_EXP_DEVICE_RAW_MULTI = IBV_DEVICE_RAW_MULTI, + IBV_EXP_DEVICE_AUTO_PATH_MIG = IBV_DEVICE_AUTO_PATH_MIG, + IBV_EXP_DEVICE_CHANGE_PHY_PORT = IBV_DEVICE_CHANGE_PHY_PORT, + IBV_EXP_DEVICE_UD_AV_PORT_ENFORCE = IBV_DEVICE_UD_AV_PORT_ENFORCE, + IBV_EXP_DEVICE_CURR_QP_STATE_MOD = IBV_DEVICE_CURR_QP_STATE_MOD, + IBV_EXP_DEVICE_SHUTDOWN_PORT = IBV_DEVICE_SHUTDOWN_PORT, + IBV_EXP_DEVICE_INIT_TYPE = IBV_DEVICE_INIT_TYPE, + IBV_EXP_DEVICE_PORT_ACTIVE_EVENT = IBV_DEVICE_PORT_ACTIVE_EVENT, + IBV_EXP_DEVICE_SYS_IMAGE_GUID = IBV_DEVICE_SYS_IMAGE_GUID, + IBV_EXP_DEVICE_RC_RNR_NAK_GEN = IBV_DEVICE_RC_RNR_NAK_GEN, + IBV_EXP_DEVICE_SRQ_RESIZE = 
IBV_DEVICE_SRQ_RESIZE, + IBV_EXP_DEVICE_N_NOTIFY_CQ = IBV_DEVICE_N_NOTIFY_CQ, + IBV_EXP_DEVICE_XRC = IBV_DEVICE_XRC, + + IBV_EXP_DEVICE_DC_TRANSPORT = (IBV_EXP_START_FLAG << 0), + IBV_EXP_DEVICE_QPG = (IBV_EXP_START_FLAG << 1), + IBV_EXP_DEVICE_UD_RSS = (IBV_EXP_START_FLAG << 2), + IBV_EXP_DEVICE_UD_TSS = (IBV_EXP_START_FLAG << 3), + IBV_EXP_DEVICE_EXT_ATOMICS = (IBV_EXP_START_FLAG << 4), + IBV_EXP_DEVICE_NOP = (IBV_EXP_START_FLAG << 5), + IBV_EXP_DEVICE_UMR = (IBV_EXP_START_FLAG << 6), + IBV_EXP_DEVICE_ODP = (IBV_EXP_START_FLAG << 7), + IBV_EXP_DEVICE_VXLAN_SUPPORT = (IBV_EXP_START_FLAG << 10), + IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT = (IBV_EXP_START_FLAG << 11), + IBV_EXP_DEVICE_RX_CSUM_IP_PKT = (IBV_EXP_START_FLAG << 12), + IBV_EXP_DEVICE_MEM_WINDOW = (IBV_EXP_START_FLAG << 17), + IBV_EXP_DEVICE_MEM_MGT_EXTENSIONS = (IBV_EXP_START_FLAG << 21), + IBV_EXP_DEVICE_DC_INFO = (IBV_EXP_START_FLAG << 22), + /* Jumping to 23 as of next capability in include/rdma/ib_verbs.h */ + IBV_EXP_DEVICE_MW_TYPE_2A = (IBV_EXP_START_FLAG << 23), + IBV_EXP_DEVICE_MW_TYPE_2B = (IBV_EXP_START_FLAG << 24), + IBV_EXP_DEVICE_CROSS_CHANNEL = (IBV_EXP_START_FLAG << 28), + IBV_EXP_DEVICE_MANAGED_FLOW_STEERING = (IBV_EXP_START_FLAG << 29), + IBV_EXP_DEVICE_MR_ALLOCATE = (IBV_EXP_START_FLAG << 30), + IBV_EXP_DEVICE_SHARED_MR = (IBV_EXP_START_FLAG << 31), +}; + +/* + * Flags for ibv_exp_device_attr struct comp_mask. 
+ */ +enum ibv_exp_device_attr_comp_mask { + IBV_EXP_DEVICE_ATTR_CALC_CAP = (1 << 0), + IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK = (1 << 1), + IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = (1 << 2), + IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS = (1 << 3), + IBV_EXP_DEVICE_DC_RD_REQ = (1 << 4), + IBV_EXP_DEVICE_DC_RD_RES = (1 << 5), + IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ = (1 << 6), + IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ = (1 << 7), + IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS = (1 << 8), + IBV_EXP_DEVICE_ATTR_UMR = (1 << 9), + IBV_EXP_DEVICE_ATTR_ODP = (1 << 10), + IBV_EXP_DEVICE_ATTR_MAX_DCT = (1 << 11), + IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN = (1 << 12), + IBV_EXP_DEVICE_ATTR_RX_HASH = (1 << 13), + IBV_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ = (1 << 14), + IBV_EXP_DEVICE_ATTR_MAX_DEVICE_CTX = (1 << 15), + IBV_EXP_DEVICE_ATTR_MP_RQ = (1 << 16), + /* set supported bits for validity check */ + IBV_EXP_DEVICE_ATTR_RESERVED = (1 << 17), +}; + +struct ibv_exp_device_calc_cap { + uint64_t data_types; + uint64_t data_sizes; + uint64_t int_ops; + uint64_t uint_ops; + uint64_t fp_ops; +}; + +struct ibv_exp_ext_atomics_params { + uint64_t log_atomic_arg_sizes; /* bit-mask of supported sizes */ + uint32_t max_fa_bit_boundary; + uint32_t log_max_atomic_inline; +}; + +enum ibv_odp_general_cap_bits { + IBV_EXP_ODP_SUPPORT = 1 << 0, +}; + +enum ibv_odp_transport_cap_bits { + IBV_EXP_ODP_SUPPORT_SEND = 1 << 0, + IBV_EXP_ODP_SUPPORT_RECV = 1 << 1, + IBV_EXP_ODP_SUPPORT_WRITE = 1 << 2, + IBV_EXP_ODP_SUPPORT_READ = 1 << 3, + IBV_EXP_ODP_SUPPORT_ATOMIC = 1 << 4, + IBV_EXP_ODP_SUPPORT_SRQ_RECV = 1 << 5, +}; + +struct ibv_exp_umr_caps { + uint32_t max_klm_list_size; + uint32_t max_send_wqe_inline_klms; + uint32_t max_umr_recursion_depth; + uint32_t max_umr_stride_dimension; +}; + +struct ibv_exp_odp_caps { + uint64_t general_odp_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + uint32_t dc_odp_caps; + uint32_t xrc_odp_caps; + uint32_t raw_eth_odp_caps; + } per_transport_caps; +}; 
+ +enum ibv_exp_supported_qp_types { + IBV_EXP_QPT_RC = 1ULL << 0, + IBV_EXP_QPT_UC = 1ULL << 1, + IBV_EXP_QPT_UD = 1ULL << 2, + IBV_EXP_QPT_XRC_INIT = 1ULL << 3, + IBV_EXP_QPT_XRC_TGT = 1ULL << 4, + IBV_EXP_QPT_RAW_PACKET = 1ULL << 5, + IBV_EXP_QPT_RESERVED = 1ULL << 6 +}; + +struct ibv_exp_rx_hash_caps { + uint32_t max_rwq_indirection_tables; + uint32_t max_rwq_indirection_table_size; + uint8_t supported_hash_functions; /* from ibv_exp_rx_hash_function_flags */ + uint64_t supported_packet_fields; /* from ibv_exp_rx_hash_fields */ + uint32_t supported_qps; /* from ibv_exp_supported_qp_types */ +}; + +enum ibv_exp_mp_rq_shifts { + IBV_EXP_MP_RQ_NO_SHIFT = 0, + IBV_EXP_MP_RQ_2BYTES_SHIFT = 1 << 0 +}; + +struct ibv_exp_mp_rq_caps { + uint32_t supported_qps; /* use ibv_exp_supported_qp_types */ + uint32_t allowed_shifts; /* use ibv_exp_mp_rq_shifts */ + uint8_t min_single_wqe_log_num_of_strides; + uint8_t max_single_wqe_log_num_of_strides; + uint8_t min_single_stride_log_num_of_bytes; + uint8_t max_single_stride_log_num_of_bytes; +}; + +struct ibv_exp_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int reserved; /* place holder to align with ibv_device_attr */ + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_exp_atomic_cap exp_atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; + uint32_t comp_mask; + struct 
ibv_exp_device_calc_cap calc_cap; + uint64_t timestamp_mask; + uint64_t hca_core_clock; + uint64_t exp_device_cap_flags; /* use ibv_exp_device_cap_flags */ + int max_dc_req_rd_atom; + int max_dc_res_rd_atom; + int inline_recv_sz; + uint32_t max_rss_tbl_sz; + struct ibv_exp_ext_atomics_params ext_atom; + struct ibv_exp_umr_caps umr_caps; + struct ibv_exp_odp_caps odp_caps; + int max_dct; + int max_ctx_res_domain; + struct ibv_exp_rx_hash_caps rx_hash_caps; + uint32_t max_wq_type_rq; + int max_device_ctx; + struct ibv_exp_mp_rq_caps mp_rq_caps; +}; + +enum ibv_exp_access_flags { + IBV_EXP_ACCESS_LOCAL_WRITE = IBV_ACCESS_LOCAL_WRITE, + IBV_EXP_ACCESS_REMOTE_WRITE = IBV_ACCESS_REMOTE_WRITE, + IBV_EXP_ACCESS_REMOTE_READ = IBV_ACCESS_REMOTE_READ, + IBV_EXP_ACCESS_REMOTE_ATOMIC = IBV_ACCESS_REMOTE_ATOMIC, + IBV_EXP_ACCESS_MW_BIND = IBV_ACCESS_MW_BIND, + + IBV_EXP_ACCESS_ALLOCATE_MR = (IBV_EXP_START_FLAG << 5), + IBV_EXP_ACCESS_SHARED_MR_USER_READ = (IBV_EXP_START_FLAG << 6), + IBV_EXP_ACCESS_SHARED_MR_USER_WRITE = (IBV_EXP_START_FLAG << 7), + IBV_EXP_ACCESS_SHARED_MR_GROUP_READ = (IBV_EXP_START_FLAG << 8), + IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE = (IBV_EXP_START_FLAG << 9), + IBV_EXP_ACCESS_SHARED_MR_OTHER_READ = (IBV_EXP_START_FLAG << 10), + IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE = (IBV_EXP_START_FLAG << 11), + IBV_EXP_ACCESS_NO_RDMA = (IBV_EXP_START_FLAG << 12), + IBV_EXP_ACCESS_MW_ZERO_BASED = (IBV_EXP_START_FLAG << 13), + IBV_EXP_ACCESS_ON_DEMAND = (IBV_EXP_START_FLAG << 14), + IBV_EXP_ACCESS_RELAXED = (IBV_EXP_START_FLAG << 15), + /* set supported bits for validity check */ + IBV_EXP_ACCESS_RESERVED = (IBV_EXP_START_FLAG << 16) +}; + +/* memory window information struct that is common to types 1 and 2 */ +struct ibv_exp_mw_bind_info { + struct ibv_mr *mr; + uint64_t addr; + uint64_t length; + uint64_t exp_mw_access_flags; /* use ibv_exp_access_flags */ +}; + +/* + * Flags for ibv_exp_mw_bind struct comp_mask + */ +enum ibv_exp_bind_mw_comp_mask { + 
IBV_EXP_BIND_MW_RESERVED = (1 << 0) +}; + +/* type 1 specific info */ +struct ibv_exp_mw_bind { + struct ibv_qp *qp; + struct ibv_mw *mw; + uint64_t wr_id; + uint64_t exp_send_flags; /* use ibv_exp_send_flags */ + struct ibv_exp_mw_bind_info bind_info; + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +enum ibv_exp_calc_op { + IBV_EXP_CALC_OP_ADD = 0, + IBV_EXP_CALC_OP_MAXLOC, + IBV_EXP_CALC_OP_BAND, + IBV_EXP_CALC_OP_BXOR, + IBV_EXP_CALC_OP_BOR, + IBV_EXP_CALC_OP_NUMBER +}; + +enum ibv_exp_calc_data_type { + IBV_EXP_CALC_DATA_TYPE_INT = 0, + IBV_EXP_CALC_DATA_TYPE_UINT, + IBV_EXP_CALC_DATA_TYPE_FLOAT, + IBV_EXP_CALC_DATA_TYPE_NUMBER +}; + +enum ibv_exp_calc_data_size { + IBV_EXP_CALC_DATA_SIZE_64_BIT = 0, + IBV_EXP_CALC_DATA_SIZE_NUMBER +}; + +enum ibv_exp_wr_opcode { + IBV_EXP_WR_RDMA_WRITE = IBV_WR_RDMA_WRITE, + IBV_EXP_WR_RDMA_WRITE_WITH_IMM = IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_EXP_WR_SEND = IBV_WR_SEND, + IBV_EXP_WR_SEND_WITH_IMM = IBV_WR_SEND_WITH_IMM, + IBV_EXP_WR_RDMA_READ = IBV_WR_RDMA_READ, + IBV_EXP_WR_ATOMIC_CMP_AND_SWP = IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_EXP_WR_ATOMIC_FETCH_AND_ADD = IBV_WR_ATOMIC_FETCH_AND_ADD, + + IBV_EXP_WR_SEND_WITH_INV = 8 + IBV_EXP_START_ENUM, + IBV_EXP_WR_LOCAL_INV = 10 + IBV_EXP_START_ENUM, + IBV_EXP_WR_BIND_MW = 14 + IBV_EXP_START_ENUM, + IBV_EXP_WR_SEND_ENABLE = 0x20 + IBV_EXP_START_ENUM, + IBV_EXP_WR_RECV_ENABLE, + IBV_EXP_WR_CQE_WAIT, + IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP, + IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD, + IBV_EXP_WR_NOP, + IBV_EXP_WR_UMR_FILL, + IBV_EXP_WR_UMR_INVALIDATE, +}; + +enum ibv_exp_send_flags { + IBV_EXP_SEND_FENCE = IBV_SEND_FENCE, + IBV_EXP_SEND_SIGNALED = IBV_SEND_SIGNALED, + IBV_EXP_SEND_SOLICITED = IBV_SEND_SOLICITED, + IBV_EXP_SEND_INLINE = IBV_SEND_INLINE, + + IBV_EXP_SEND_IP_CSUM = (IBV_EXP_START_FLAG << 0), + IBV_EXP_SEND_WITH_CALC = (IBV_EXP_START_FLAG << 1), + IBV_EXP_SEND_WAIT_EN_LAST = (IBV_EXP_START_FLAG << 2), + IBV_EXP_SEND_EXT_ATOMIC_INLINE = 
(IBV_EXP_START_FLAG << 3), +}; + +struct ibv_exp_cmp_swap { + uint64_t compare_mask; + uint64_t compare_val; + uint64_t swap_val; + uint64_t swap_mask; +}; + +struct ibv_exp_fetch_add { + uint64_t add_val; + uint64_t field_boundary; +}; + +/* + * Flags for ibv_exp_send_wr struct comp_mask + */ +enum ibv_exp_send_wr_comp_mask { + IBV_EXP_SEND_WR_ATTR_RESERVED = 1 << 0 +}; + +struct ibv_exp_mem_region { + uint64_t base_addr; + struct ibv_mr *mr; + size_t length; +}; + +struct ibv_exp_mem_repeat_block { + uint64_t base_addr; /* array, size corresponds to ndim */ + struct ibv_mr *mr; + size_t *byte_count; /* array, size corresponds to ndim */ + size_t *stride; /* array, size corresponds to ndim */ +}; + +enum ibv_exp_umr_wr_type { + IBV_EXP_UMR_MR_LIST, + IBV_EXP_UMR_REPEAT +}; + +struct ibv_exp_send_wr { + uint64_t wr_id; + struct ibv_exp_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_exp_wr_opcode exp_opcode; /* use ibv_exp_wr_opcode */ + int reserved; /* place holder to align with ibv_send_wr */ + union { + uint32_t imm_data; /* in network byte order */ + uint32_t invalidate_rkey; + } ex; + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + + uint32_t xrc_remote_srq_num; + }; + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_cq *cq; + int32_t cq_count; + } cqe_wait; + struct { + struct ibv_qp *qp; + int32_t wqe_count; + } wqe_enable; + } task; + union { + struct { + enum ibv_exp_calc_op calc_op; + enum ibv_exp_calc_data_type data_type; + enum ibv_exp_calc_data_size data_size; + } calc; + } op; + struct { 
+ struct ibv_ah *ah; + uint64_t dct_access_key; + uint32_t dct_number; + } dc; + struct { + struct ibv_mw *mw; + uint32_t rkey; + struct ibv_exp_mw_bind_info bind_info; + } bind_mw; + uint64_t exp_send_flags; /* use ibv_exp_send_flags */ + uint32_t comp_mask; /* reserved for future growth (must be 0) */ + union { + struct { + uint32_t umr_type; /* use ibv_exp_umr_wr_type */ + struct ibv_exp_mkey_list_container *memory_objects; /* used when IBV_EXP_SEND_INLINE is not set */ + uint64_t exp_access; /* use ibv_exp_access_flags */ + struct ibv_mr *modified_mr; + uint64_t base_addr; + uint32_t num_mrs; /* array size of mem_repeat_block_list or mem_reg_list */ + union { + struct ibv_exp_mem_region *mem_reg_list; /* array, size corresponds to num_mrs */ + struct { + struct ibv_exp_mem_repeat_block *mem_repeat_block_list; /* array, size corresponds to num_mrs */ + size_t *repeat_count; /* array, size corresponds to stride_dim */ + uint32_t stride_dim; + } rb; + } mem_list; + } umr; + struct { + uint32_t log_arg_sz; + uint64_t remote_addr; + uint32_t rkey; + union { + struct { + /* For the next four fields: + * If operand_size <= 8 then inline data is immediate + * from the corresponding field; for small operands, + * the least-significant bits are used.
+ * Else the fields are pointers in the process's address space + * where arguments are stored + */ + union { + struct ibv_exp_cmp_swap cmp_swap; + struct ibv_exp_fetch_add fetch_add; + } op; + } inline_data; /* IBV_EXP_SEND_EXT_ATOMIC_INLINE is set */ + /* in the future add support for non-inline argument provisioning */ + } wr_data; + } masked_atomics; + } ext_op; +}; + +/* + * Flags for ibv_exp_values struct comp_mask + */ +enum ibv_exp_values_comp_mask { + IBV_EXP_VALUES_HW_CLOCK_NS = 1 << 0, + IBV_EXP_VALUES_HW_CLOCK = 1 << 1, + IBV_EXP_VALUES_RESERVED = 1 << 2 +}; + +struct ibv_exp_values { + uint32_t comp_mask; + uint64_t hwclock_ns; + uint64_t hwclock; +}; + +/* + * Flags for flags field in the ibv_exp_cq_init_attr struct + */ +enum ibv_exp_cq_create_flags { + IBV_EXP_CQ_CREATE_CROSS_CHANNEL = 1 << 0, + IBV_EXP_CQ_TIMESTAMP = 1 << 1, + IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME = 1 << 2, + /* + * note: update IBV_EXP_CQ_CREATE_FLAGS_MASK when adding new fields + */ +}; + +enum { + IBV_EXP_CQ_CREATE_FLAGS_MASK = IBV_EXP_CQ_CREATE_CROSS_CHANNEL | + IBV_EXP_CQ_TIMESTAMP | + IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME, +}; + +/* + * Flags for ibv_exp_cq_init_attr struct comp_mask + * Set flags only when relevant field is valid + */ +enum ibv_exp_cq_init_attr_mask { + IBV_EXP_CQ_INIT_ATTR_FLAGS = 1 << 0, + IBV_EXP_CQ_INIT_ATTR_RESERVED = 1 << 1, /* This field is kept for backward compatibility + * of application which use the following to set comp_mask: + * cq_init_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_RESERVED - 1 + * This kind of setting is no longer accepted and application + * may set only valid known fields, for example: + * cq_init_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_FLAGS | + * IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN + */ + IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN = 1 << 1, + IBV_EXP_CQ_INIT_ATTR_RESERVED1 = 1 << 2, +}; + +struct ibv_exp_res_domain { + struct ibv_context *context; +}; + +struct ibv_exp_cq_init_attr { + uint32_t comp_mask; + uint32_t flags; + struct ibv_exp_res_domain 
*res_domain; +}; + +/* + * Flags for ibv_exp_ah_attr struct comp_mask + */ +enum ibv_exp_ah_attr_attr_comp_mask { + IBV_EXP_AH_ATTR_LL = 1 << 0, + IBV_EXP_AH_ATTR_VID = 1 << 1, + IBV_EXP_AH_ATTR_RESERVED = 1 << 2 +}; + +enum ll_address_type { + LL_ADDRESS_UNKNOWN, + LL_ADDRESS_IB, + LL_ADDRESS_ETH, + LL_ADDRESS_SIZE +}; + +struct ibv_exp_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; + uint32_t comp_mask; + struct { + enum ll_address_type type; + uint32_t len; + char *address; + } ll_address; + uint16_t vid; +}; + +/* + * Flags for exp_attr_mask argument of ibv_exp_modify_qp + */ +enum ibv_exp_qp_attr_mask { + IBV_EXP_QP_STATE = IBV_QP_STATE, + IBV_EXP_QP_CUR_STATE = IBV_QP_CUR_STATE, + IBV_EXP_QP_EN_SQD_ASYNC_NOTIFY = IBV_QP_EN_SQD_ASYNC_NOTIFY, + IBV_EXP_QP_ACCESS_FLAGS = IBV_QP_ACCESS_FLAGS, + IBV_EXP_QP_PKEY_INDEX = IBV_QP_PKEY_INDEX, + IBV_EXP_QP_PORT = IBV_QP_PORT, + IBV_EXP_QP_QKEY = IBV_QP_QKEY, + IBV_EXP_QP_AV = IBV_QP_AV, + IBV_EXP_QP_PATH_MTU = IBV_QP_PATH_MTU, + IBV_EXP_QP_TIMEOUT = IBV_QP_TIMEOUT, + IBV_EXP_QP_RETRY_CNT = IBV_QP_RETRY_CNT, + IBV_EXP_QP_RNR_RETRY = IBV_QP_RNR_RETRY, + IBV_EXP_QP_RQ_PSN = IBV_QP_RQ_PSN, + IBV_EXP_QP_MAX_QP_RD_ATOMIC = IBV_QP_MAX_QP_RD_ATOMIC, + IBV_EXP_QP_ALT_PATH = IBV_QP_ALT_PATH, + IBV_EXP_QP_MIN_RNR_TIMER = IBV_QP_MIN_RNR_TIMER, + IBV_EXP_QP_SQ_PSN = IBV_QP_SQ_PSN, + IBV_EXP_QP_MAX_DEST_RD_ATOMIC = IBV_QP_MAX_DEST_RD_ATOMIC, + IBV_EXP_QP_PATH_MIG_STATE = IBV_QP_PATH_MIG_STATE, + IBV_EXP_QP_CAP = IBV_QP_CAP, + IBV_EXP_QP_DEST_QPN = IBV_QP_DEST_QPN, + + IBV_EXP_QP_GROUP_RSS = IBV_EXP_START_FLAG << 21, + IBV_EXP_QP_DC_KEY = IBV_EXP_START_FLAG << 22, + IBV_EXP_QP_FLOW_ENTROPY = IBV_EXP_START_FLAG << 23, +}; + +/* + * Flags for ibv_exp_qp_attr struct comp_mask + * Set flags only when relevant field is valid + */ +enum ibv_exp_qp_attr_comp_mask { + IBV_EXP_QP_ATTR_FLOW_ENTROPY = 1UL << 0, + 
IBV_EXP_QP_ATTR_RESERVED = 1UL << 1 +}; + +struct ibv_exp_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; /* use ibv_access_flags from verbs.h */ + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; + uint64_t dct_key; + uint32_t comp_mask; /* use ibv_exp_qp_attr_comp_mask */ + uint32_t flow_entropy; +}; + +/* + * Flags for ibv_exp_qp_init_attr struct comp_mask + * Set flags only when relevant field is valid + */ +enum ibv_exp_qp_init_attr_comp_mask { + IBV_EXP_QP_INIT_ATTR_PD = 1 << 0, + IBV_EXP_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS = 1 << 2, + IBV_EXP_QP_INIT_ATTR_INL_RECV = 1 << 3, + IBV_EXP_QP_INIT_ATTR_QPG = 1 << 4, + IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG = 1 << 5, + IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS = 1 << 6, + IBV_EXP_QP_INIT_ATTR_RESERVED = 1 << 7, /* This field is kept for backward compatibility + * of applications which use the following to set comp_mask: + * qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_RESERVED - 1 + * This kind of setting is no longer accepted and applications + * may set only valid known fields, for example: + * qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD | + * IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS + */ + IBV_EXP_QP_INIT_ATTR_RES_DOMAIN = 1 << 7, + IBV_EXP_QP_INIT_ATTR_RX_HASH = 1 << 8, + IBV_EXP_QP_INIT_ATTR_PORT = 1 << 9, + IBV_EXP_QP_INIT_ATTR_RESERVED1 = 1 << 10, +}; + +enum ibv_exp_qpg_type { + IBV_EXP_QPG_NONE = 0, + IBV_EXP_QPG_PARENT = (1<<0), + IBV_EXP_QPG_CHILD_RX =
(1<<1), + IBV_EXP_QPG_CHILD_TX = (1<<2) +}; + +struct ibv_exp_qpg_init_attrib { + uint32_t tss_child_count; + uint32_t rss_child_count; +}; + +struct ibv_exp_qpg { + uint32_t qpg_type; + union { + struct ibv_qp *qpg_parent; /* see qpg_type */ + struct ibv_exp_qpg_init_attrib parent_attrib; + }; +}; + +/* + * RX Hash Function flags. +*/ +enum ibv_exp_rx_hash_function_flags { + IBV_EXP_RX_HASH_FUNC_TOEPLITZ = 1 << 0, + IBV_EXP_RX_HASH_FUNC_XOR = 1 << 1 +}; + +/* + * RX Hash flags: these flags select which incoming packet fields + * participate in the RX Hash. Each flag represents one packet field; + * when the flag is set, the field represented by the flag + * participates in the RX Hash calculation. + * Notice: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. +*/ +enum ibv_exp_rx_hash_fields { + IBV_EXP_RX_HASH_SRC_IPV4 = 1 << 0, + IBV_EXP_RX_HASH_DST_IPV4 = 1 << 1, + IBV_EXP_RX_HASH_SRC_IPV6 = 1 << 2, + IBV_EXP_RX_HASH_DST_IPV6 = 1 << 3, + IBV_EXP_RX_HASH_SRC_PORT_TCP = 1 << 4, + IBV_EXP_RX_HASH_DST_PORT_TCP = 1 << 5, + IBV_EXP_RX_HASH_SRC_PORT_UDP = 1 << 6, + IBV_EXP_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + +/* + * RX Hash QP configuration. Sets the hash function, hash types and + * indirection table for QPs with the IBV_EXP_QP_INIT_ATTR_RX_HASH flag enabled.
+*/ +struct ibv_exp_rx_hash_conf { + /* enum ibv_exp_rx_hash_function_flags */ + uint8_t rx_hash_function; + /* valid only for Toeplitz */ + uint8_t rx_hash_key_len; + uint8_t *rx_hash_key; + /* enum ibv_exp_rx_hash_fields */ + uint64_t rx_hash_fields_mask; + struct ibv_exp_rwq_ind_table *rwq_ind_tbl; +}; + +/* + * Flags for exp_create_flags field in ibv_exp_qp_init_attr struct + */ +enum ibv_exp_qp_create_flags { + IBV_EXP_QP_CREATE_CROSS_CHANNEL = (1 << 2), + IBV_EXP_QP_CREATE_MANAGED_SEND = (1 << 3), + IBV_EXP_QP_CREATE_MANAGED_RECV = (1 << 4), + IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW = (1 << 6), + IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW = (1 << 7), + IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY = (1 << 8), + IBV_EXP_QP_CREATE_UMR = (1 << 9), + /* set supported bits for validity check */ + IBV_EXP_QP_CREATE_MASK = (0x000003DC) +}; + +struct ibv_exp_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; /* use ibv_exp_qp_init_attr_comp_mask */ + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + uint32_t exp_create_flags; /* use ibv_exp_qp_create_flags */ + + uint32_t max_inl_recv; + struct ibv_exp_qpg qpg; + uint32_t max_atomic_arg; + uint32_t max_inl_send_klms; + struct ibv_exp_res_domain *res_domain; + struct ibv_exp_rx_hash_conf *rx_hash_conf; + uint8_t port_num; +}; + +/* + * Flags for ibv_exp_dct_init_attr struct comp_mask + */ +enum ibv_exp_dct_init_attr_comp_mask { + IBV_EXP_DCT_INIT_ATTR_RESERVED = 1 << 0 +}; + +enum { + IBV_EXP_DCT_CREATE_FLAGS_MASK = (1 << 0) - 1, +}; + +struct ibv_exp_dct_init_attr { + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_srq *srq; + uint64_t dc_key; + uint8_t port; + uint32_t access_flags; /* use ibv_access_flags from verbs.h */ + uint8_t min_rnr_timer; + uint8_t tclass; + uint32_t flow_label; + enum ibv_mtu mtu; + uint8_t pkey_index; + uint8_t gid_index; + uint8_t hop_limit; + uint32_t
inline_size; + uint32_t create_flags; + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +enum { + IBV_EXP_DCT_STATE_ACTIVE = 0, + IBV_EXP_DCT_STATE_DRAINING = 1, + IBV_EXP_DCT_STATE_DRAINED = 2 +}; + +/* + * Flags for ibv_exp_dct_attr struct comp_mask + */ +enum ibv_exp_dct_attr_comp_mask { + IBV_EXP_DCT_ATTR_RESERVED = 1 << 0 +}; + +struct ibv_exp_dct_attr { + uint64_t dc_key; + uint8_t port; + uint32_t access_flags; /* use ibv_access_flags from verbs.h */ + uint8_t min_rnr_timer; + uint8_t tclass; + uint32_t flow_label; + enum ibv_mtu mtu; + uint8_t pkey_index; + uint8_t gid_index; + uint8_t hop_limit; + uint32_t key_violations; + uint8_t state; + struct ibv_srq *srq; + struct ibv_cq *cq; + struct ibv_pd *pd; + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +enum { + IBV_EXP_QUERY_PORT_STATE = 1 << 0, + IBV_EXP_QUERY_PORT_MAX_MTU = 1 << 1, + IBV_EXP_QUERY_PORT_ACTIVE_MTU = 1 << 2, + IBV_EXP_QUERY_PORT_GID_TBL_LEN = 1 << 3, + IBV_EXP_QUERY_PORT_CAP_FLAGS = 1 << 4, + IBV_EXP_QUERY_PORT_MAX_MSG_SZ = 1 << 5, + IBV_EXP_QUERY_PORT_BAD_PKEY_CNTR = 1 << 6, + IBV_EXP_QUERY_PORT_QKEY_VIOL_CNTR = 1 << 7, + IBV_EXP_QUERY_PORT_PKEY_TBL_LEN = 1 << 8, + IBV_EXP_QUERY_PORT_LID = 1 << 9, + IBV_EXP_QUERY_PORT_SM_LID = 1 << 10, + IBV_EXP_QUERY_PORT_LMC = 1 << 11, + IBV_EXP_QUERY_PORT_MAX_VL_NUM = 1 << 12, + IBV_EXP_QUERY_PORT_SM_SL = 1 << 13, + IBV_EXP_QUERY_PORT_SUBNET_TIMEOUT = 1 << 14, + IBV_EXP_QUERY_PORT_INIT_TYPE_REPLY = 1 << 15, + IBV_EXP_QUERY_PORT_ACTIVE_WIDTH = 1 << 16, + IBV_EXP_QUERY_PORT_ACTIVE_SPEED = 1 << 17, + IBV_EXP_QUERY_PORT_PHYS_STATE = 1 << 18, + IBV_EXP_QUERY_PORT_LINK_LAYER = 1 << 19, + /* mask of the fields that exist in the standard query_port_command */ + IBV_EXP_QUERY_PORT_STD_MASK = (1 << 20) - 1, + /* mask of all supported fields */ + IBV_EXP_QUERY_PORT_MASK = IBV_EXP_QUERY_PORT_STD_MASK, +}; + +/* + * Flags for ibv_exp_port_attr struct comp_mask + * Set flags only when relevant field is valid + */ +enum
ibv_exp_query_port_attr_comp_mask { + IBV_EXP_QUERY_PORT_ATTR_MASK1 = 1 << 0, + IBV_EXP_QUERY_PORT_ATTR_RESERVED = 1 << 1, + + IBV_EXP_QUERY_PORT_ATTR_MASKS = IBV_EXP_QUERY_PORT_ATTR_RESERVED - 1 +}; + +struct ibv_exp_port_attr { + union { + struct { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; + }; + struct ibv_port_attr port_attr; + }; + uint32_t comp_mask; + uint32_t mask1; +}; + +enum ibv_exp_cq_attr_mask { + IBV_EXP_CQ_MODERATION = 1 << 0, + IBV_EXP_CQ_CAP_FLAGS = 1 << 1 +}; + +enum ibv_exp_cq_cap_flags { + IBV_EXP_CQ_IGNORE_OVERRUN = (1 << 0), + /* set supported bits for validity check */ + IBV_EXP_CQ_CAP_MASK = (0x00000001) +}; + +/* + * Flags for ibv_exp_cq_attr struct comp_mask + * Set flags only when relevant field is valid + */ +enum ibv_exp_cq_attr_comp_mask { + IBV_EXP_CQ_ATTR_MODERATION = (1 << 0), + IBV_EXP_CQ_ATTR_CQ_CAP_FLAGS = (1 << 1), + /* set supported bits for validity check */ + IBV_EXP_CQ_ATTR_RESERVED = (1 << 2) +}; + +struct ibv_exp_cq_attr { + uint32_t comp_mask; + struct { + uint16_t cq_count; + uint16_t cq_period; + } moderation; + uint32_t cq_cap_flags; +}; + +enum ibv_exp_rereg_mr_flags { + IBV_EXP_REREG_MR_CHANGE_TRANSLATION = IBV_REREG_MR_CHANGE_TRANSLATION, + IBV_EXP_REREG_MR_CHANGE_PD = IBV_REREG_MR_CHANGE_PD, + IBV_EXP_REREG_MR_CHANGE_ACCESS = IBV_REREG_MR_CHANGE_ACCESS, + IBV_EXP_REREG_MR_KEEP_VALID = IBV_REREG_MR_KEEP_VALID, + IBV_EXP_REREG_MR_FLAGS_SUPPORTED = ((IBV_EXP_REREG_MR_KEEP_VALID << 1) - 1) +}; + +enum ibv_exp_rereg_mr_attr_mask { + IBV_EXP_REREG_MR_ATTR_RESERVED = (1 << 0) +}; + +struct 
ibv_exp_rereg_mr_attr { + uint32_t comp_mask; /* use ibv_exp_rereg_mr_attr_mask */ +}; + +/* + * Flags for ibv_exp_reg_shared_mr_in struct comp_mask + */ +enum ibv_exp_reg_shared_mr_comp_mask { + IBV_EXP_REG_SHARED_MR_RESERVED = (1 << 0) +}; + +struct ibv_exp_reg_shared_mr_in { + uint32_t mr_handle; + struct ibv_pd *pd; + void *addr; + uint64_t exp_access; /* use ibv_exp_access_flags */ + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +enum ibv_exp_flow_flags { + IBV_EXP_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, +}; + +enum ibv_exp_flow_attr_type { + /* steering according to rule specifications */ + IBV_EXP_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_EXP_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_EXP_FLOW_ATTR_MC_DEFAULT = 0x2, + /* sniffer rule - receive all port traffic */ + IBV_EXP_FLOW_ATTR_SNIFFER = 0x3, +}; + +enum ibv_exp_flow_spec_type { + IBV_EXP_FLOW_SPEC_ETH = 0x20, + IBV_EXP_FLOW_SPEC_IB = 0x21, + IBV_EXP_FLOW_SPEC_IPV4 = 0x30, + IBV_EXP_FLOW_SPEC_IPV6 = 0x31, + IBV_EXP_FLOW_SPEC_TCP = 0x40, + IBV_EXP_FLOW_SPEC_UDP = 0x41, +}; + +struct ibv_exp_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_exp_flow_spec_eth { + enum ibv_exp_flow_spec_type type; + uint16_t size; + struct ibv_exp_flow_eth_filter val; + struct ibv_exp_flow_eth_filter mask; +}; + +struct ibv_exp_flow_ib_filter { + uint32_t qpn; + uint8_t dst_gid[16]; +}; + +struct ibv_exp_flow_spec_ib { + enum ibv_exp_flow_spec_type type; + uint16_t size; + struct ibv_exp_flow_ib_filter val; + struct ibv_exp_flow_ib_filter mask; +}; + +struct ibv_exp_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_exp_flow_spec_ipv4 { + enum 
ibv_exp_flow_spec_type type; + uint16_t size; + struct ibv_exp_flow_ipv4_filter val; + struct ibv_exp_flow_ipv4_filter mask; +}; + +struct ibv_exp_flow_ipv6_filter { + uint8_t src_ip[16]; + uint8_t dst_ip[16]; +}; + +struct ibv_exp_flow_spec_ipv6 { + enum ibv_exp_flow_spec_type type; + uint16_t size; + struct ibv_exp_flow_ipv6_filter val; + struct ibv_exp_flow_ipv6_filter mask; +}; + +struct ibv_exp_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_exp_flow_spec_tcp_udp { + enum ibv_exp_flow_spec_type type; + uint16_t size; + struct ibv_exp_flow_tcp_udp_filter val; + struct ibv_exp_flow_tcp_udp_filter mask; +}; + +struct ibv_exp_flow_spec { + union { + struct { + enum ibv_exp_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_exp_flow_spec_ib ib; + struct ibv_exp_flow_spec_eth eth; + struct ibv_exp_flow_spec_ipv4 ipv4; + struct ibv_exp_flow_spec_tcp_udp tcp_udp; + struct ibv_exp_flow_spec_ipv6 ipv6; + }; +}; + +struct ibv_exp_flow_attr { + enum ibv_exp_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_exp_flow_spec_xxx [L2] + * struct ibv_exp_flow_spec_yyy [L3/L4] + */ + uint64_t reserved; /* reserved for future growth (must be 0) */ +}; + +struct ibv_exp_flow { + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_exp_dct { + struct ibv_context *context; + uint32_t handle; + uint32_t dct_num; + struct ibv_pd *pd; + struct ibv_srq *srq; + struct ibv_cq *cq; + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +enum ibv_exp_wc_opcode { + IBV_EXP_WC_SEND, + IBV_EXP_WC_RDMA_WRITE, + IBV_EXP_WC_RDMA_READ, + IBV_EXP_WC_COMP_SWAP, + IBV_EXP_WC_FETCH_ADD, + IBV_EXP_WC_BIND_MW, + IBV_EXP_WC_LOCAL_INV = 7, + IBV_EXP_WC_MASKED_COMP_SWAP = 9, + IBV_EXP_WC_MASKED_FETCH_ADD = 10, + IBV_EXP_WC_UMR = 0x100, +/* + * Set value of IBV_EXP_WC_RECV so 
consumers can test if a completion is a + * receive by testing (opcode & IBV_EXP_WC_RECV). + */ + IBV_EXP_WC_RECV = 1 << 7, + IBV_EXP_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_exp_wc_flags { + IBV_EXP_WC_GRH = IBV_WC_GRH, + IBV_EXP_WC_WITH_IMM = IBV_WC_WITH_IMM, + + IBV_EXP_WC_WITH_INV = IBV_EXP_START_FLAG << 2, + IBV_EXP_WC_WITH_SL = IBV_EXP_START_FLAG << 4, + IBV_EXP_WC_WITH_SLID = IBV_EXP_START_FLAG << 5, + IBV_EXP_WC_WITH_TIMESTAMP = IBV_EXP_START_FLAG << 6, + IBV_EXP_WC_QP = IBV_EXP_START_FLAG << 7, + IBV_EXP_WC_SRQ = IBV_EXP_START_FLAG << 8, + IBV_EXP_WC_DCT = IBV_EXP_START_FLAG << 9, + IBV_EXP_WC_RX_IP_CSUM_OK = IBV_EXP_START_FLAG << 10, + IBV_EXP_WC_RX_TCP_UDP_CSUM_OK = IBV_EXP_START_FLAG << 11, + IBV_EXP_WC_RX_IPV4_PACKET = IBV_EXP_START_FLAG << 12, + IBV_EXP_WC_RX_IPV6_PACKET = IBV_EXP_START_FLAG << 13, + IBV_EXP_WC_RX_TUNNEL_PACKET = IBV_EXP_START_FLAG << 14, + IBV_EXP_WC_RX_OUTER_IP_CSUM_OK = IBV_EXP_START_FLAG << 15, + IBV_EXP_WC_RX_OUTER_TCP_UDP_CSUM_OK = IBV_EXP_START_FLAG << 16, + IBV_EXP_WC_RX_OUTER_IPV4_PACKET = IBV_EXP_START_FLAG << 17, + IBV_EXP_WC_RX_OUTER_IPV6_PACKET = IBV_EXP_START_FLAG << 18, +}; + +struct ibv_exp_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_exp_wc_opcode exp_opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int reserved; /* place holder to align with ibv_wc */ + uint16_t pkey_index; + uint16_t slid; /* invalid when TS is used */ + uint8_t sl; /* invalid when TS is used */ + uint8_t dlid_path_bits; + uint64_t timestamp; + struct ibv_qp *qp; + struct ibv_srq *srq; + struct ibv_exp_dct *dct; + uint64_t exp_wc_flags; /* use ibv_exp_wc_flags */ +}; + +/* + * Flags for ibv_exp_prefetch_mr comp_mask + */ +enum ibv_exp_prefetch_attr_comp_mask { + IBV_EXP_PREFETCH_MR_RESERVED = (1 << 0), +}; + +/* + * Flags for ibv_exp_prefetch_mr flags + */ +enum ibv_exp_prefetch_attr_flags { + /* request prefetching for write access. 
Used for both local and remote */ + IBV_EXP_PREFETCH_WRITE_ACCESS = (1 << 0), +}; + +struct ibv_exp_prefetch_attr { + /* Use enum ibv_exp_prefetch_attr_flags */ + uint32_t flags; + /* Address of the area to prefetch */ + void *addr; + /* Length of the area to prefetch */ + size_t length; + uint32_t comp_mask; +}; + +/* + * Flags for ibv_exp_reg_mr_in struct comp_mask + */ +enum ibv_exp_reg_mr_in_comp_mask { + /* set supported bits for validity check */ + IBV_EXP_REG_MR_CREATE_FLAGS = (1 << 0), + IBV_EXP_REG_MR_RESERVED = (1 << 1) +}; + +enum ibv_exp_reg_mr_create_flags { + IBV_EXP_REG_MR_CREATE_CONTIG = (1 << 0) /* register mr with contiguous pages */ +}; + +struct ibv_exp_reg_mr_in { + struct ibv_pd *pd; + void *addr; + size_t length; + uint64_t exp_access; /* use ibv_exp_access_flags */ + uint32_t comp_mask; /* reserved for future growth (must be 0) */ + uint32_t create_flags; /* use ibv_exp_reg_mr_create_flags */ +}; + + +enum ibv_exp_task_type { + IBV_EXP_TASK_SEND = 0, + IBV_EXP_TASK_RECV = 1 +}; + +/* + * Flags for ibv_exp_task struct comp_mask + */ +enum ibv_exp_task_comp_mask { + IBV_EXP_TASK_RESERVED = (1 << 0) +}; + +struct ibv_exp_task { + enum ibv_exp_task_type task_type; + struct { + struct ibv_qp *qp; + union { + struct ibv_exp_send_wr *send_wr; + struct ibv_recv_wr *recv_wr; + }; + } item; + struct ibv_exp_task *next; + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +/* + * Flags for ibv_exp_arm_attr struct comp_mask + */ +enum ibv_exp_arm_attr_comp_mask { + IBV_EXP_ARM_ATTR_RESERVED = (1 << 0) +}; +struct ibv_exp_arm_attr { + uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; + +enum ibv_exp_mr_create_flags { + IBV_EXP_MR_SIGNATURE_EN = (1 << 0), + IBV_EXP_MR_INDIRECT_KLMS = (1 << 1) +}; + +struct ibv_exp_mr_init_attr { + uint32_t max_klm_list_size; /* num of entries */ + uint32_t create_flags; /* use ibv_exp_mr_create_flags */ + uint64_t exp_access_flags; /* use ibv_exp_access_flags */ +}; + +/* + * 
Comp_mask for ibv_exp_create_mr_in struct comp_mask
+ */
+enum ibv_exp_create_mr_in_comp_mask {
+	IBV_EXP_CREATE_MR_IN_RESERVED	= (1 << 0)
+};
+/*
+ * Argument type of the drv_exp_create_mr/lib_exp_create_mr operations
+ * declared in struct verbs_context_exp.
+ */
+struct ibv_exp_create_mr_in {
+	struct ibv_pd *pd;
+	struct ibv_exp_mr_init_attr attr;
+	uint32_t comp_mask; /* use ibv_exp_create_mr_in_comp_mask */
+};
+
+/*
+ * Flags for ibv_exp_mkey_attr struct comp_mask
+ */
+enum ibv_exp_mkey_attr_comp_mask {
+	IBV_EXP_MKEY_ATTR_RESERVED	= (1 << 0)
+};
+/*
+ * Attribute type of the drv_exp_query_mkey/lib_exp_query_mkey operations
+ * declared in struct verbs_context_exp.
+ */
+struct ibv_exp_mkey_attr {
+	uint32_t max_klm_list_size;
+	uint32_t comp_mask;	/* use ibv_exp_mkey_attr_comp_mask */
+};
+/*
+ * Object returned by the *_exp_alloc_mkey_list_memory operations and passed
+ * to the *_exp_dealloc_mkey_list_memory operations of struct
+ * verbs_context_exp.
+ */
+struct ibv_exp_mkey_list_container {
+	uint32_t max_klm_list_size;
+	struct ibv_context *context;
+};
+/*
+ * Values for the mkey_list_type field of
+ * struct ibv_exp_mkey_list_container_attr.
+ */
+enum ibv_exp_mkey_list_type {
+	IBV_EXP_MKEY_LIST_TYPE_INDIRECT_MR
+};
+
+/*
+ * Flags for ibv_exp_mkey_list_container_attr struct comp_mask
+ */
+enum ibv_exp_alloc_mkey_list_comp_mask {
+	IBV_EXP_MKEY_LIST_CONTAINER_RESERVED	= (1 << 0)
+};
+/*
+ * Argument type of the *_exp_alloc_mkey_list_memory operations declared in
+ * struct verbs_context_exp.
+ */
+struct ibv_exp_mkey_list_container_attr {
+	struct ibv_pd *pd;
+	uint32_t mkey_list_type;  /* use ibv_exp_mkey_list_type */
+	uint32_t max_klm_list_size;
+	uint32_t comp_mask;	/* use ibv_exp_alloc_mkey_list_comp_mask */
+};
+
+/*
+ * Flags for ibv_exp_rereg_out struct comp_mask
+ */
+enum ibv_exp_rereg_mr_comp_mask {
+	IBV_EXP_REREG_MR_RESERVED	= (1 << 0)
+};
+/*
+ * Last argument ('out') of the drv_exp_rereg_mr operation declared in
+ * struct verbs_context_exp.
+ */
+struct ibv_exp_rereg_out {
+	int need_dofork;
+	uint32_t comp_mask; /* use ibv_exp_rereg_mr_comp_mask */
+};
+
+/*
+ * Flags for ibv_exp_dereg_out struct comp_mask
+ */
+enum ibv_exp_dereg_mr_comp_mask {
+	IBV_EXP_DEREG_MR_RESERVED	= (1 << 0)
+};
+/*
+ * Last argument ('out') of the drv_exp_dereg_mr operation declared in
+ * struct verbs_context_exp.
+ */
+struct ibv_exp_dereg_out {
+	int need_dofork;
+	uint32_t comp_mask; /* use ibv_exp_dereg_mr_comp_mask */
+};
+/*
+ * Node of a singly linked list of name/value string pairs
+ * (see struct verbs_environment).
+ */
+struct verbs_env_item {
+	char			*name;
+	char			*value;
+	struct verbs_env_item	*next;
+};
+/*
+ * List of verbs_env_item nodes plus a mutex; struct verbs_context_exp holds
+ * a pointer to one of these in its 'venv' member.
+ * NOTE(review): 'mtx' presumably serializes access to the list - confirm
+ * against the library implementation.
+ */
+struct verbs_environment {
+	struct verbs_env_item	*head;
+	pthread_mutex_t		mtx;
+};
+
+/* RSS stuff: work queues (WQ) and receive WQ indirection tables */
+
+enum ibv_exp_wq_type {
+	IBV_EXP_WQT_RQ,		/* WQ contains receive WQEs */
+	IBV_EXP_WQT_SRQ		/* WQ is associated with a regular ibv_srq */
+};
+/*
+ * Values for the 'state' field of struct ibv_exp_wq and for the
+ * wq_state/curr_wq_state fields of struct ibv_exp_wq_attr.
+ */
+enum ibv_exp_wq_state {
+	IBV_EXP_WQS_RESET,
+	IBV_EXP_WQS_RDY,
+	IBV_EXP_WQS_ERR,
+	IBV_EXP_WQS_UNKNOWN
+};
+
+/*
+ * Work Queue. A QP can be created without internal WQs "packaged" inside it;
+ * such QPs can be configured to use an "external" WQ object as their
+ * receive/send queue.
+ * A WQ is associated (many to one) with a Completion Queue and owns the WQ
+ * properties (PD, WQ size etc).
+ * A WQ of type IBV_EXP_WQT_RQ contains receive WQEs, in which case its PD
+ * serves scatter as well.
+ * A WQ of type IBV_EXP_WQT_SRQ is associated (many to one) with a regular
+ * ibv_srq, in which case it does not hold receive WQEs.
+ * QPs can be associated with IBV_EXP_WQT_S/RQ WQs via a WQ Indirection Table.
+ */
+struct ibv_exp_wq {
+	struct ibv_context     *context;
+	void		       *wq_context; /* Associated Context of the WQ */
+	uint32_t		handle;
+	/* Protection domain WQ should be associated with */
+	struct	ibv_pd	       *pd;
+	/* CQ to be associated with the WQ */
+	struct	ibv_cq	       *cq;
+	/* SRQ handle if WQ is to be associated with an SRQ, otherwise NULL */
+	struct	ibv_srq	       *srq;
+	uint32_t		wq_num;
+	enum ibv_exp_wq_state	state;
+	enum ibv_exp_wq_type	wq_type;
+	uint32_t		comp_mask;
+};
+/*
+ * Flags for ibv_exp_wq_init_attr struct comp_mask
+ */
+enum ibv_exp_wq_init_attr_mask {
+	IBV_EXP_CREATE_WQ_RES_DOMAIN	= (1 << 0),
+	IBV_EXP_CREATE_WQ_MP_RQ		= (1 << 1),
+	IBV_EXP_CREATE_WQ_RESERVED	= (1 << 2)
+};
+/*
+ * Multi-packet RQ settings, embedded in struct ibv_exp_wq_init_attr as the
+ * 'mp_rq' member.
+ */
+struct ibv_exp_wq_mp_rq {
+	enum ibv_exp_mp_rq_shifts	use_shift;
+	uint8_t				single_wqe_log_num_of_strides;
+	uint8_t				single_stride_log_num_of_bytes;
+};
+/*
+ * Argument type of the exp_create_wq operation declared in
+ * struct verbs_context_exp.
+ */
+struct ibv_exp_wq_init_attr {
+	/* Associated Context of the WQ */
+	void		       *wq_context;
+	enum ibv_exp_wq_type	wq_type;
+	/* Valid for non IBV_EXP_WQT_SRQ WQ */
+	uint32_t		max_recv_wr;
+	/* Valid for non IBV_EXP_WQT_SRQ WQ */
+	uint32_t		max_recv_sge;
+	/* Protection domain WQ should be associated with */
+	struct	ibv_pd	       *pd;
+	/* CQ to be associated with the WQ */
+	struct	ibv_cq	       *cq;
+	/* SRQ handle if WQ is of type IBV_EXP_WQT_SRQ, otherwise NULL */
+	struct	ibv_srq	       *srq;
+	/* refers to ibv_exp_wq_init_attr_mask */
+	uint32_t		comp_mask;
+	struct ibv_exp_res_domain *res_domain;
+	struct ibv_exp_wq_mp_rq	mp_rq;
+};
+/*
+ * Flags for ibv_exp_wq_attr struct attr_mask
+ */
+enum ibv_exp_wq_attr_mask {
+	IBV_EXP_WQ_ATTR_STATE		= 1 << 0,
+	IBV_EXP_WQ_ATTR_CURR_STATE	= 1 << 1,
+	IBV_EXP_WQ_ATTR_RESERVED	= 1 << 2
+};
+/*
+ * Argument type of the exp_modify_wq operation declared in
+ * struct verbs_context_exp.
+ */
+struct ibv_exp_wq_attr {
+	/* enum ibv_exp_wq_attr_mask */
+	uint32_t		attr_mask;
+	/* Move the RQ to this state */
+	enum	ibv_exp_wq_state	wq_state;
+	/* Assume this is the current RQ state */
+	enum	ibv_exp_wq_state	curr_wq_state;
+};
+
+/*
+ * Receive Work Queue Indirection Table.
+ * It is used to distribute incoming packets between different
+ * Receive Work Queues. Associating Receive WQs with different CPU cores
+ * allows the traffic load to be spread across those CPU cores.
+ * The Indirection Table can contain only WQs of type IBV_EXP_WQT_S/RQ.
+ */
+struct ibv_exp_rwq_ind_table {
+	struct ibv_context *context;
+	struct ibv_pd *pd;
+	int ind_tbl_handle;
+	int ind_tbl_num;
+	uint32_t comp_mask;
+};
+/*
+ * Flags for ibv_exp_rwq_ind_table_init_attr struct comp_mask
+ */
+enum ibv_exp_ind_table_init_attr_mask {
+	IBV_EXP_CREATE_IND_TABLE_RESERVED = (1 << 0)
+};
+
+/*
+ * Receive Work Queue Indirection Table attributes
+ */
+struct ibv_exp_rwq_ind_table_init_attr {
+	struct ibv_pd *pd;
+	/* Log, base 2, of Indirection table size */
+	uint32_t log_ind_tbl_size;
+	/* Each entry is a pointer to Receive Work Queue */
+	struct ibv_exp_wq **ind_tbl;
+	uint32_t comp_mask;
+};
+
+/* Accelerated verbs */
+enum ibv_exp_thread_model {
+	IBV_EXP_THREAD_SAFE,	/* The library is responsible for protecting the object in a multithreaded environment */
+	IBV_EXP_THREAD_UNSAFE,	/* The application is responsible for protecting the object in a multithreaded environment */
+	IBV_EXP_THREAD_SINGLE	/* The object is called from only one thread */
+};
+
+enum ibv_exp_msg_model {
+	IBV_EXP_MSG_DEFAULT,		/* Use the provider default message model */
+	IBV_EXP_MSG_LOW_LATENCY,	/* Hint the provider to optimize for low latency */
+	IBV_EXP_MSG_HIGH_BW,		/* Hint the provider to optimize for high bandwidth */
+	IBV_EXP_MSG_FORCE_LOW_LATENCY,	/* Force the provider to optimize for low latency */
+};
+
+/*
+ * 
Resource domains + */ +enum ibv_exp_res_domain_init_attr_comp_mask { + IBV_EXP_RES_DOMAIN_THREAD_MODEL = (1 << 0), + IBV_EXP_RES_DOMAIN_MSG_MODEL = (1 << 1), + IBV_EXP_RES_DOMAIN_RESERVED = (1 << 2), +}; + +struct ibv_exp_res_domain_init_attr { + uint32_t comp_mask; /* use ibv_exp_res_domain_init_attr_comp_mask */ + enum ibv_exp_thread_model thread_model; + enum ibv_exp_msg_model msg_model; +}; + +enum ibv_exp_destroy_res_domain_comp_mask { + IBV_EXP_DESTROY_RES_DOMAIN_RESERVED = (1 << 0), +}; + +struct ibv_exp_destroy_res_domain_attr { + uint32_t comp_mask; /* use ibv_exp_destroy_res_domain_comp_mask */ +}; + +/* + * Query interface (specialized Verbs) + */ + +enum ibv_exp_query_intf_flags { + /* Interface functions includes correctness and validity checks */ + IBV_EXP_QUERY_INTF_FLAG_ENABLE_CHECKS = (1 << 0), +}; + +enum ibv_exp_intf_family { + IBV_EXP_INTF_QP_BURST, + IBV_EXP_INTF_CQ, + IBV_EXP_INTF_WQ, + IBV_EXP_INTF_RESERVED, +}; + +enum ibv_exp_experimental_intf_family { + IBV_EXP_EXPERIMENTAL_INTF_RESERVED, +}; + +enum ibv_exp_intf_scope { + IBV_EXP_INTF_GLOBAL, /* Permanent interface, identified by + * the ibv_exp_intf_family enum + */ + IBV_EXP_INTF_EXPERIMENTAL, /* Interface under evaluation, identified by + * the ibv_exp_experimental_intf_family enum + * This interface may change between lib + * versions + */ + IBV_EXP_INTF_VENDOR, /* Vendor specific interface, defined in vendor + * separate header file + */ + IBV_EXP_INTF_VENDOR_EXPERIMENTAL, /* Vendor interface under evaluation, + * defined in vendor separate header + * file + */ +}; + +/* Return status from ibv_exp_query_intf */ +enum ibv_exp_query_intf_status { + IBV_EXP_INTF_STAT_OK, + IBV_EXP_INTF_STAT_VENDOR_NOT_SUPPORTED, /* The provided 'vendor_guid' is not supported */ + IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED, /* The provided 'intf' is not supported */ + IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED, /* The provided 'intf_version' is not supported */ + IBV_EXP_INTF_STAT_INVAL_PARARM, /* General invalid 
parameter */ + IBV_EXP_INTF_STAT_INVAL_OBJ_STATE, /* QP is not in INIT, RTR or RTS state */ + IBV_EXP_INTF_STAT_INVAL_OBJ, /* Mismatch between the provided 'obj'(CQ/QP/WQ) and requested 'intf' */ + IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED, /* The provided set of 'flags' is not supported */ + IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED, /* The provided set of 'family_flags' is not supported */ +}; + +enum ibv_exp_query_intf_comp_mask { + IBV_EXP_QUERY_INTF_RESERVED = (1 << 0), +}; + +struct ibv_exp_query_intf_params { + uint32_t flags; /* use ibv_exp_query_intf_flags */ + enum ibv_exp_intf_scope intf_scope; + uint64_t vendor_guid; /* set in case VENDOR intf_scope selected */ + uint32_t intf; /* for GLOBAL intf_scope use ibv_exp_intf_family enum */ + uint32_t intf_version; /* Version */ + void *obj; /* interface object (CQ/QP/WQ) */ + void *family_params; /* Family-specific params */ + uint32_t family_flags; /* Family-specific flags */ + uint32_t comp_mask; /* use ibv_exp_query_intf_comp_mask */ +}; + +enum ibv_exp_release_intf_comp_mask { + IBV_EXP_RELEASE_INTF_RESERVED = (1 << 0), +}; + +struct ibv_exp_release_intf_params { + uint32_t comp_mask; /* use ibv_exp_release_intf_comp_mask */ +}; + +/* + * Family interfaces + */ + +/* QP burst family */ + +/* Flags to use in family_flags field of ibv_exp_query_intf_params on family creation */ +enum ibv_exp_qp_burst_family_create_flags { + /* To disable loop-back of multi-cast messages in RAW-ETH */ + IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK = (1 << 0), + /* To enable Multi-Packet send WR when possible */ + IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR = (1 << 1), +}; + +/* Flags to use on send functions of QP burst family */ +enum ibv_exp_qp_burst_family_flags { + IBV_EXP_QP_BURST_SIGNALED = 1 << 0, + IBV_EXP_QP_BURST_SOLICITED = 1 << 1, + IBV_EXP_QP_BURST_IP_CSUM = 1 << 2, + IBV_EXP_QP_BURST_TUNNEL = 1 << 3, + IBV_EXP_QP_BURST_FENCE = 1 << 4, +}; + +struct ibv_exp_qp_burst_family { + /* + * send_pending - Put 
one message in the provider send queue. + * + * Common usage: After calling several times to send_pending + * the application need to call send_flush to ensure the send + * of the pending messages. + * Note: Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending)(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags); + /* + * send_pending_inline - Put one inline message in the provider send queue. + * + * Common usage: Same as send_pending + * Notes: + * - The message length must fit the max inline size of the QP. + * Providing bigger messages may lead to data corruption and + * segmentation fault. + * - Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending_inline)(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags); + /* + * send_pending_sg_list - Put one scatter-gather(sg) message in the provider send queue. + * + * Common usage: Same as send_pending + * Notes: + * - The number of sg entries must fit the max_send_sge of the QP. + * Providing bigger list of sg entries may lead to data corruption and + * segmentation fault. + * - Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending_sg_list)(struct ibv_qp *qp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags); + /* + * send_flush - To flush the pending messages. + * + * Note: Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_flush)(struct ibv_qp *qp); + /* + * send_burst - Send a list of 'num' messages (no send_flush required in this case) + */ + int (*send_burst)(struct ibv_qp *qp, struct ibv_sge *msg_list, uint32_t num, uint32_t flags); + /* + * recv_burst - Post a set of 'num' receive buffers. + * + * Note: One sge per message is supported by this function + */ + int (*recv_burst)(struct ibv_qp *qp, struct ibv_sge *msg_list, uint32_t num); +}; + +/* WQ family */ +struct ibv_exp_wq_family { + /* + * recv_sg_list - Post one scatter-gather(sg) receive buffer. 
+ * + * Note: + * - The number of sg entries must fit the max_recv_sge of the WQ. + * Providing bigger list of sg entries may lead to data corruption and + * segmentation fault. + */ + int (*recv_sg_list)(struct ibv_exp_wq *wq, struct ibv_sge *sg_list, uint32_t num_sg); + /* + * recv_burst - Post a set of 'num' receive buffers. + * + * Note: One sge per message is supported by this function + */ + int (*recv_burst)(struct ibv_exp_wq *wq, struct ibv_sge *msg_list, uint32_t num); +}; + +/* CQ family */ +enum ibv_exp_cq_family_flags { + /* RX offloads flags */ + /* The cq_family_flags are applicable + * according to the existence of the + * related device capabilities flags */ + IBV_EXP_CQ_RX_IP_CSUM_OK = 1 << 0, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK = 1 << 1, /* IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_IPV4_PACKET = 1 << 2, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_IPV6_PACKET = 1 << 3, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_TUNNEL_PACKET = 1 << 4, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK = 1 << 5, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK = 1 << 6, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IPV4_PACKET = 1 << 7, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IPV6_PACKET = 1 << 8, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + + /* Flags supported from CQ family version 1 */ + /* Multi-Packet RQ flag */ + IBV_EXP_CQ_RX_MULTI_PACKET_LAST_V1 = 1 << 9, /* Last packet on WR */ +}; + +/* All functions of CQ family included in CQ family version 1 */ +struct ibv_exp_cq_family { + int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); + int32_t (*poll_length)(struct ibv_cq *cq, void *buf, uint32_t *inl); + int32_t (*poll_length_flags)(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags); +}; + +struct 
ibv_exp_cq_family_v1 { + /* + * poll_cnt - Poll up to 'max' valid completions + * + * The function returns the number of valid completions it + * managed to drain from the CQ. + * + * Usage example: In case a CQ is connected to one send-queue + * the application may use this function to get + * the number of the QP send-completions. + * + * Return value (n): + * n >= 0 : number extracted completions. + * n == -1 : operation failed. completion is not extracted. + * To extract this completion, ibv_poll_cq() must be used + * + * Note: The function designed to support TX completion, it may also be + * used for RX completion but it is not supporting RX inline-scatter. + */ + int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); + /* + * poll_length - Poll one receive completion and provide the related + * message length. + * + * The function returns only the length of the completed message. + * In case of inline received message the message will be copied + * to the provided buffer ('buf') and the '*inl' status will be set. + * The function extracts only completion of regular receive-messages. + * In case of send-message completion or SRQ receive-message completion + * it returns -1. + * + * Usage example: In case a CQ is connected to one receive-queue + * the application may use this function to get + * the size of the next received message. + * + * Return value (n): + * n > 0 : successful completion with positive length. + * *inl will be set to 1 if data was copied to buffer. + * + * 0 : Empty. + * n == -1 : operation failed. completion is not extracted. + * To extract this completion, ibv_poll_cq() must be used + */ + int32_t (*poll_length)(struct ibv_cq *cq, void *buf, uint32_t *inl); + /* + * poll_length_flags - Poll one receive completion and provide the related + * message length and completion flags. 
+ * + * The same as poll_length but also retrieves completion flags as + * defined by the enum ibv_exp_cq_family_flags + */ + int32_t (*poll_length_flags)(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags); + /* + * poll_length_flags_mp_rq - Poll one receive completion and provide the related + * message length, packet-offset and completion flags. + * + * The same as poll_length_flags but: + * - Without the inline-receive support. + * - Also retrieves offset in the WR posted buffer as defined by the WR SG list. + * The start of the received packet is located in this offset. + */ + int32_t (*poll_length_flags_mp_rq)(struct ibv_cq *cq, uint32_t *offset, uint32_t *flags); +}; + +enum { + IBV_EXP_NUM_DC_INFO_LIDS = 30 +}; + +struct ibv_exp_dc_info_ent { + uint16_t lid[IBV_EXP_NUM_DC_INFO_LIDS]; + uint32_t seqnum; +}; + +enum ibv_exp_roce_gid_type { + IBV_EXP_IB_ROCE_V1_GID_TYPE, + IBV_EXP_ROCE_V2_GID_TYPE, + IBV_EXP_ROCE_V1_5_GID_TYPE, +}; + +enum ibv_exp_query_gid_attr { + IBV_EXP_QUERY_GID_ATTR_TYPE = (1 << 0), + IBV_EXP_QUERY_GID_ATTR_GID = (1 << 1), + IBV_EXP_QUERY_GID_ATTR_RESERVED = (1 << 2), +}; + +struct ibv_exp_gid_attr { + uint32_t comp_mask; + enum ibv_exp_roce_gid_type type; + union ibv_gid gid; +}; + +struct verbs_context_exp { + /* "grows up" - new fields go here */ + int (*exp_query_gid_attr)(struct ibv_context *context, uint8_t port_num, + unsigned int index, + struct ibv_exp_gid_attr *attr); + int (*exp_destroy_rwq_ind_table)(struct ibv_exp_rwq_ind_table *rwq_ind_table); + struct ibv_exp_rwq_ind_table *(*exp_create_rwq_ind_table)(struct ibv_context *context, + struct ibv_exp_rwq_ind_table_init_attr *init_attr); + int (*exp_destroy_wq)(struct ibv_exp_wq *wq); + int (*exp_modify_wq)(struct ibv_exp_wq *wq, + struct ibv_exp_wq_attr *wq_attr); + struct ibv_exp_wq * (*exp_create_wq)(struct ibv_context *context, + struct ibv_exp_wq_init_attr *wq_init_attr); + int (*drv_exp_poll_dc_info)(struct ibv_context *context, + struct ibv_exp_dc_info_ent 
*ents, + int nent, int port); + void *(*exp_query_intf)(struct ibv_context *context, struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); + int (*exp_release_intf)(struct ibv_context *context, void *intf, + struct ibv_exp_release_intf_params *params); + struct ibv_exp_res_domain *(*exp_create_res_domain)(struct ibv_context *context, + struct ibv_exp_res_domain_init_attr *attr); + int (*exp_destroy_res_domain)(struct ibv_context *context, + struct ibv_exp_res_domain *res_dom, + struct ibv_exp_destroy_res_domain_attr *attr); + int (*lib_exp_use_priv_env)(struct ibv_context *context); + int (*lib_exp_setenv)(struct ibv_context *context, const char *name, + const char *value, int overwrite); + struct verbs_environment *venv; + int (*drv_exp_dereg_mr)(struct ibv_mr *mr, struct ibv_exp_dereg_out *out); + int (*exp_rereg_mr)(struct ibv_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr); + int (*drv_exp_rereg_mr)(struct ibv_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr, struct ibv_exp_rereg_out *out); + int (*drv_exp_prefetch_mr)(struct ibv_mr *mr, + struct ibv_exp_prefetch_attr *attr); + int (*lib_exp_prefetch_mr)(struct ibv_mr *mr, + struct ibv_exp_prefetch_attr *attr); + struct ibv_exp_mkey_list_container * (*drv_exp_alloc_mkey_list_memory)(struct ibv_exp_mkey_list_container_attr *attr); + struct ibv_exp_mkey_list_container * (*lib_exp_alloc_mkey_list_memory)(struct ibv_exp_mkey_list_container_attr *attr); + int (*drv_exp_dealloc_mkey_list_memory)(struct ibv_exp_mkey_list_container *mem); + int (*lib_exp_dealloc_mkey_list_memory)(struct ibv_exp_mkey_list_container *mem); + int (*drv_exp_query_mkey)(struct ibv_mr *mr, struct ibv_exp_mkey_attr *mkey_attr); + int (*lib_exp_query_mkey)(struct ibv_mr *mr, struct ibv_exp_mkey_attr *mkey_attr); + struct ibv_mr * (*drv_exp_create_mr)(struct 
ibv_exp_create_mr_in *in); + struct ibv_mr * (*lib_exp_create_mr)(struct ibv_exp_create_mr_in *in); + int (*drv_exp_arm_dct)(struct ibv_exp_dct *dct, struct ibv_exp_arm_attr *attr); + int (*lib_exp_arm_dct)(struct ibv_exp_dct *dct, struct ibv_exp_arm_attr *attr); + int (*drv_exp_bind_mw)(struct ibv_exp_mw_bind *mw_bind); + int (*lib_exp_bind_mw)(struct ibv_exp_mw_bind *mw_bind); + int (*drv_exp_post_send)(struct ibv_qp *qp, + struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr); + struct ibv_mr * (*drv_exp_reg_mr)(struct ibv_exp_reg_mr_in *in); + struct ibv_mr * (*lib_exp_reg_mr)(struct ibv_exp_reg_mr_in *in); + struct ibv_ah * (*drv_exp_ibv_create_ah)(struct ibv_pd *pd, + struct ibv_exp_ah_attr *attr_exp); + int (*drv_exp_query_values)(struct ibv_context *context, int q_values, + struct ibv_exp_values *values); + struct ibv_cq * (*exp_create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_exp_cq_init_attr *attr); + int (*drv_exp_ibv_poll_cq)(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size); + void * (*drv_exp_get_legacy_xrc) (struct ibv_srq *ibv_srq); + void (*drv_exp_set_legacy_xrc) (struct ibv_srq *ibv_srq, void *legacy_xrc); + struct ibv_mr * (*drv_exp_ibv_reg_shared_mr)(struct ibv_exp_reg_shared_mr_in *in); + struct ibv_mr * (*lib_exp_ibv_reg_shared_mr)(struct ibv_exp_reg_shared_mr_in *in); + int (*drv_exp_modify_qp)(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t exp_attr_mask); + int (*lib_exp_modify_qp)(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t exp_attr_mask); + int (*drv_exp_post_task)(struct ibv_context *context, + struct ibv_exp_task *task, + struct ibv_exp_task **bad_task); + int (*lib_exp_post_task)(struct ibv_context *context, + struct ibv_exp_task *task, + struct ibv_exp_task **bad_task); + int (*drv_exp_modify_cq)(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, int attr_mask); + int (*lib_exp_modify_cq)(struct ibv_cq 
*cq, + struct ibv_exp_cq_attr *attr, int attr_mask); + int (*drv_exp_ibv_destroy_flow) (struct ibv_exp_flow *flow); + int (*lib_exp_ibv_destroy_flow) (struct ibv_exp_flow *flow); + struct ibv_exp_flow * (*drv_exp_ibv_create_flow) (struct ibv_qp *qp, + struct ibv_exp_flow_attr + *flow_attr); + struct ibv_exp_flow * (*lib_exp_ibv_create_flow) (struct ibv_qp *qp, + struct ibv_exp_flow_attr + *flow_attr); + + int (*drv_exp_query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_exp_port_attr *port_attr); + int (*lib_exp_query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_exp_port_attr *port_attr); + struct ibv_exp_dct *(*create_dct)(struct ibv_context *context, + struct ibv_exp_dct_init_attr *attr); + int (*destroy_dct)(struct ibv_exp_dct *dct); + int (*query_dct)(struct ibv_exp_dct *dct, struct ibv_exp_dct_attr *attr); + int (*drv_exp_query_device)(struct ibv_context *context, + struct ibv_exp_device_attr *attr); + int (*lib_exp_query_device)(struct ibv_context *context, + struct ibv_exp_device_attr *attr); + struct ibv_qp *(*drv_exp_create_qp)(struct ibv_context *context, + struct ibv_exp_qp_init_attr *init_attr); + struct ibv_qp *(*lib_exp_create_qp)(struct ibv_context *context, + struct ibv_exp_qp_init_attr *init_attr); + size_t sz; /* Set by library on struct allocation, */ + /* must be located as last field */ +}; + + +static inline struct verbs_context_exp *verbs_get_exp_ctx(struct ibv_context *ctx) +{ + struct verbs_context *app_ex_ctx = verbs_get_ctx(ctx); + char *actual_ex_ctx; + + if (!app_ex_ctx || !(app_ex_ctx->has_comp_mask & VERBS_CONTEXT_EXP)) + return NULL; + + actual_ex_ctx = ((char *)ctx) - (app_ex_ctx->sz - sizeof(struct ibv_context)); + + return (struct verbs_context_exp *)(actual_ex_ctx - sizeof(struct verbs_context_exp)); +} + +#define verbs_get_exp_ctx_op(ctx, op) ({ \ + struct verbs_context_exp *_vctx = verbs_get_exp_ctx(ctx); \ + (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context_exp, 
op)) || \ + !_vctx->op) ? NULL : _vctx; }) + +#define verbs_set_exp_ctx_op(_vctx, op, ptr) ({ \ + struct verbs_context_exp *vctx = _vctx; \ + if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context_exp, op))) \ + vctx->op = ptr; }) + + +static inline struct ibv_qp * +ibv_exp_create_qp(struct ibv_context *context, struct ibv_exp_qp_init_attr *qp_init_attr) +{ + struct verbs_context_exp *vctx; + uint32_t mask = qp_init_attr->comp_mask; + + if (mask == IBV_EXP_QP_INIT_ATTR_PD) + return ibv_create_qp(qp_init_attr->pd, + (struct ibv_qp_init_attr *) qp_init_attr); + + vctx = verbs_get_exp_ctx_op(context, lib_exp_create_qp); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(qp_init_attr->comp_mask, + IBV_EXP_QP_INIT_ATTR_RESERVED1 - 1); + + return vctx->lib_exp_create_qp(context, qp_init_attr); +} + +/* + * ibv_exp_use_priv_env + * + * switch to use private environment + */ +static inline int ibv_exp_use_priv_env(struct ibv_context *context) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, lib_exp_use_priv_env); + if (!vctx) { + errno = ENOSYS; + return -1; + } + + return vctx->lib_exp_use_priv_env(context); +} + +/* + * ibv_exp_poll_dc_info + * + * The function is not thread safe. Any locking must be done by the user. 
+ * + * Return: >= 0 number of returned entries + * < 0 error + * + */ +static inline int ibv_exp_poll_dc_info(struct ibv_context *context, + struct ibv_exp_dc_info_ent *ents, + int nent, int port) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, drv_exp_poll_dc_info); + if (!vctx) { + errno = ENOSYS; + return -1; + } + + return vctx->drv_exp_poll_dc_info(context, ents, nent, port); +} + +/* + * ibv_exp_setenv + * + * see man setenv for parameter description + */ +static inline int ibv_exp_setenv(struct ibv_context *context, + const char *name, + const char *value, + int overwrite) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, lib_exp_setenv); + if (!vctx) + return setenv(name, value, overwrite); + + return vctx->lib_exp_setenv(context, name, value, overwrite); +} + +static inline int ibv_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *attr) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(context, + lib_exp_query_device); + if (!vctx) + return ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_DEVICE_ATTR_RESERVED - 1); + return vctx->lib_exp_query_device(context, attr); +} + +static inline struct ibv_exp_dct * +ibv_exp_create_dct(struct ibv_context *context, + struct ibv_exp_dct_init_attr *attr) +{ + struct verbs_context_exp *vctx; + struct ibv_exp_dct *dct; + + vctx = verbs_get_exp_ctx_op(context, create_dct); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_DCT_INIT_ATTR_RESERVED - 1); + pthread_mutex_lock(&context->mutex); + dct = vctx->create_dct(context, attr); + if (dct) + dct->context = context; + + pthread_mutex_unlock(&context->mutex); + + return dct; +} + +static inline int ibv_exp_destroy_dct(struct ibv_exp_dct *dct) +{ + struct verbs_context_exp *vctx; + struct ibv_context *context = dct->context; + int err; + + vctx = verbs_get_exp_ctx_op(context, 
destroy_dct); + if (!vctx) { + errno = ENOSYS; + return errno; + } + + pthread_mutex_lock(&context->mutex); + err = vctx->destroy_dct(dct); + pthread_mutex_unlock(&context->mutex); + + return err; +} + +static inline int ibv_exp_query_dct(struct ibv_exp_dct *dct, + struct ibv_exp_dct_attr *attr) +{ + struct verbs_context_exp *vctx; + struct ibv_context *context = dct->context; + int err; + + vctx = verbs_get_exp_ctx_op(context, query_dct); + if (!vctx) { + errno = ENOSYS; + return errno; + } + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_DCT_ATTR_RESERVED - 1); + pthread_mutex_lock(&context->mutex); + err = vctx->query_dct(dct, attr); + pthread_mutex_unlock(&context->mutex); + + return err; +} + +static inline int ibv_exp_arm_dct(struct ibv_exp_dct *dct, + struct ibv_exp_arm_attr *attr) +{ + struct verbs_context_exp *vctx; + struct ibv_context *context = dct->context; + int err; + + vctx = verbs_get_exp_ctx_op(context, lib_exp_arm_dct); + if (!vctx) { + errno = ENOSYS; + return errno; + } + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_ARM_ATTR_RESERVED - 1); + pthread_mutex_lock(&context->mutex); + err = vctx->lib_exp_arm_dct(dct, attr); + pthread_mutex_unlock(&context->mutex); + + return err; +} + +static inline int ibv_exp_query_port(struct ibv_context *context, + uint8_t port_num, + struct ibv_exp_port_attr *port_attr) +{ + struct verbs_context_exp *vctx; + + if (0 == port_attr->comp_mask) + return ibv_query_port(context, port_num, + &port_attr->port_attr); + + /* Check that only valid flags were given */ + if ((!port_attr->comp_mask & IBV_EXP_QUERY_PORT_ATTR_MASK1) || + (port_attr->comp_mask & ~IBV_EXP_QUERY_PORT_ATTR_MASKS) || + (port_attr->mask1 & ~IBV_EXP_QUERY_PORT_MASK)) { + errno = EINVAL; + return -errno; + } + + vctx = verbs_get_exp_ctx_op(context, lib_exp_query_port); + + if (!vctx) { + /* Fallback to legacy mode */ + if (port_attr->comp_mask == IBV_EXP_QUERY_PORT_ATTR_MASK1 && + !(port_attr->mask1 & 
~IBV_EXP_QUERY_PORT_STD_MASK)) + return ibv_query_port(context, port_num, + &port_attr->port_attr); + + /* Unsupported field was requested */ + errno = ENOSYS; + return -errno; + } + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(port_attr->comp_mask, + IBV_EXP_QUERY_PORT_ATTR_RESERVED - 1); + + return vctx->lib_exp_query_port(context, port_num, port_attr); +} + +/** + * ibv_exp_post_task - Post a list of tasks to different QPs. + */ +static inline int ibv_exp_post_task(struct ibv_context *context, + struct ibv_exp_task *task, + struct ibv_exp_task **bad_task) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(context, + lib_exp_post_task); + if (!vctx) + return ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(task->comp_mask, + IBV_EXP_TASK_RESERVED - 1); + + return vctx->lib_exp_post_task(context, task, bad_task); +} + +static inline int ibv_exp_query_values(struct ibv_context *context, int q_values, + struct ibv_exp_values *values) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(context, + drv_exp_query_values); + if (!vctx) { + errno = ENOSYS; + return -errno; + } + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(values->comp_mask, + IBV_EXP_VALUES_RESERVED - 1); + + return vctx->drv_exp_query_values(context, q_values, values); +} + +static inline struct ibv_exp_flow *ibv_exp_create_flow(struct ibv_qp *qp, + struct ibv_exp_flow_attr *flow) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(qp->context, + lib_exp_ibv_create_flow); + if (!vctx || !vctx->lib_exp_ibv_create_flow) + return NULL; + + if (flow->reserved != 0L) { + fprintf(stderr, "%s:%d: flow->reserved must be 0\n", __FUNCTION__, __LINE__); + flow->reserved = 0L; + } + + return vctx->lib_exp_ibv_create_flow(qp, flow); +} + +static inline int ibv_exp_destroy_flow(struct ibv_exp_flow *flow_id) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(flow_id->context, + lib_exp_ibv_destroy_flow); + if (!vctx || !vctx->lib_exp_ibv_destroy_flow) + return -ENOSYS; + + return 
vctx->lib_exp_ibv_destroy_flow(flow_id); +} + +static inline int ibv_exp_poll_cq(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(ibcq->context, + drv_exp_ibv_poll_cq); + if (!vctx) + return -ENOSYS; + + return vctx->drv_exp_ibv_poll_cq(ibcq, num_entries, wc, wc_size); +} + +/** + * ibv_exp_post_send - Post a list of work requests to a send queue. + */ +static inline int ibv_exp_post_send(struct ibv_qp *qp, + struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(qp->context, + drv_exp_post_send); + if (!vctx) + return -ENOSYS; + + return vctx->drv_exp_post_send(qp, wr, bad_wr); +} + +/** + * ibv_exp_reg_shared_mr - Register to an existing shared memory region + * @in - Experimental register shared MR input data. + */ +static inline struct ibv_mr *ibv_exp_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *mr_in) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(mr_in->pd->context, + lib_exp_ibv_reg_shared_mr); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(mr_in->comp_mask, + IBV_EXP_REG_SHARED_MR_RESERVED - 1); + + return vctx->lib_exp_ibv_reg_shared_mr(mr_in); +} + +/** + * ibv_exp_modify_cq - Modifies the attributes for the specified CQ. + * @cq: The CQ to modify. + * @cq_attr: Specifies the CQ attributes to modify. + * @cq_attr_mask: A bit-mask used to specify which attributes of the CQ + * are being modified. 
+ */ +static inline int ibv_exp_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *cq_attr, + int cq_attr_mask) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(cq->context, + lib_exp_modify_cq); + if (!vctx) + return ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(cq_attr->comp_mask, + IBV_EXP_CQ_ATTR_RESERVED - 1); + + return vctx->lib_exp_modify_cq(cq, cq_attr, cq_attr_mask); +} + +static inline struct ibv_cq *ibv_exp_create_cq(struct ibv_context *context, + int cqe, + void *cq_context, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr) +{ + struct verbs_context_exp *vctx; + struct ibv_cq *cq; + + vctx = verbs_get_exp_ctx_op(context, exp_create_cq); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_CQ_INIT_ATTR_RESERVED1 - 1); + pthread_mutex_lock(&context->mutex); + cq = vctx->exp_create_cq(context, cqe, channel, comp_vector, attr); + if (cq) { + cq->context = context; + cq->channel = channel; + if (channel) + ++channel->refcnt; + cq->cq_context = cq_context; + cq->comp_events_completed = 0; + cq->async_events_completed = 0; + pthread_mutex_init(&cq->mutex, NULL); + pthread_cond_init(&cq->cond, NULL); + } + + pthread_mutex_unlock(&context->mutex); + + return cq; +} + +/** + * ibv_exp_modify_qp - Modify a queue pair. + * The argument exp_attr_mask specifies the QP attributes to be modified. + * Use ibv_exp_qp_attr_mask for this argument. 
+ */ +static inline int +ibv_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, uint64_t exp_attr_mask) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(qp->context, lib_exp_modify_qp); + if (!vctx) { + errno = ENOSYS; + return errno; + } + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_QP_ATTR_RESERVED - 1); + + return vctx->lib_exp_modify_qp(qp, attr, exp_attr_mask); +} + +/** + * ibv_exp_reg_mr - Register a memory region + */ +static inline struct ibv_mr *ibv_exp_reg_mr(struct ibv_exp_reg_mr_in *in) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(in->pd->context, lib_exp_reg_mr); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(in->comp_mask, + IBV_EXP_REG_MR_RESERVED - 1); + + return vctx->lib_exp_reg_mr(in); +} + + +/** + * ibv_exp_bind_mw - Bind a memory window to a region + */ +static inline int ibv_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(mw_bind->mw->context, lib_exp_bind_mw); + if (!vctx) { + errno = ENOSYS; + return errno; + } + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(mw_bind->comp_mask, + IBV_EXP_BIND_MW_RESERVED - 1); + + return vctx->lib_exp_bind_mw(mw_bind); +} + +/** + * ibv_exp_prefetch_mr - Prefetch part of a memory region. + * + * Can be used only with MRs registered with IBV_EXP_ACCESS_ON_DEMAND + * + * Returns 0 on success, + * - ENOSYS libibverbs or provider driver doesn't support the prefetching verb. + * - EFAULT when the range requested is out of the memory region bounds, or when + * parts of it are not part of the process address space. + * - EINVAL when the MR is invalid. 
+ */ +static inline int ibv_exp_prefetch_mr( + struct ibv_mr *mr, + struct ibv_exp_prefetch_attr *attr) +{ + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(mr->context, + lib_exp_prefetch_mr); + + if (!vctx) { + errno = ENOSYS; + return errno; + } + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_PREFETCH_MR_RESERVED - 1); + + return vctx->lib_exp_prefetch_mr(mr, attr); +} + +typedef int (*drv_exp_post_send_func)(struct ibv_qp *qp, + struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr); +typedef int (*drv_post_send_func)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +typedef int (*drv_exp_poll_cq_func)(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size); +typedef int (*drv_poll_cq_func)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); +typedef int (*drv_post_recv_func)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +static inline void *ibv_exp_get_provider_func(struct ibv_context *context, + enum ibv_exp_func_name name) +{ + struct verbs_context_exp *vctx; + + switch (name) { + case IBV_EXP_POST_SEND_FUNC: + vctx = verbs_get_exp_ctx_op(context, drv_exp_post_send); + if (!vctx) + goto error; + + return (void *)vctx->drv_exp_post_send; + + case IBV_EXP_POLL_CQ_FUNC: + vctx = verbs_get_exp_ctx_op(context, drv_exp_ibv_poll_cq); + if (!vctx) + goto error; + + return (void *)vctx->drv_exp_ibv_poll_cq; + + case IBV_POST_SEND_FUNC: + if (!context->ops.post_send) + goto error; + + return (void *)context->ops.post_send; + + case IBV_POLL_CQ_FUNC: + if (!context->ops.poll_cq) + goto error; + + return (void *)context->ops.poll_cq; + + case IBV_POST_RECV_FUNC: + if (!context->ops.post_recv) + goto error; + + return (void *)context->ops.post_recv; + + default: + break; + } + +error: + errno = ENOSYS; + return NULL; +} + +static inline struct ibv_mr *ibv_exp_create_mr(struct ibv_exp_create_mr_in *in) +{ + struct verbs_context_exp *vctx; + struct 
ibv_mr *mr; + + vctx = verbs_get_exp_ctx_op(in->pd->context, lib_exp_create_mr); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(in->comp_mask, + IBV_EXP_CREATE_MR_IN_RESERVED - 1); + mr = vctx->lib_exp_create_mr(in); + if (mr) + mr->pd = in->pd; + + return mr; +} + +static inline int ibv_exp_query_mkey(struct ibv_mr *mr, + struct ibv_exp_mkey_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(mr->context, lib_exp_query_mkey); + if (!vctx) { + errno = ENOSYS; + return errno; + } + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_MKEY_ATTR_RESERVED - 1); + + return vctx->lib_exp_query_mkey(mr, attr); +} + +static inline int ibv_exp_dealloc_mkey_list_memory(struct ibv_exp_mkey_list_container *mem) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(mem->context, + lib_exp_dealloc_mkey_list_memory); + if (!vctx) { + errno = ENOSYS; + return errno; + } + + return vctx->lib_exp_dealloc_mkey_list_memory(mem); +} + +static inline struct ibv_exp_mkey_list_container * +ibv_exp_alloc_mkey_list_memory(struct ibv_exp_mkey_list_container_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(attr->pd->context, + lib_exp_alloc_mkey_list_memory); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_MKEY_LIST_CONTAINER_RESERVED - 1); + + return vctx->lib_exp_alloc_mkey_list_memory(attr); +} + +/** + * ibv_rereg_mr - Re-Register a memory region + * + * For exp_access use ibv_exp_access_flags + */ +static inline int ibv_exp_rereg_mr(struct ibv_mr *mr, int flags, + struct ibv_pd *pd, void *addr, + size_t length, uint64_t exp_access, + struct ibv_exp_rereg_mr_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(mr->context, exp_rereg_mr); + if (!vctx) + return errno = ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + 
IBV_EXP_REREG_MR_ATTR_RESERVED - 1); + + return vctx->exp_rereg_mr(mr, flags, pd, addr, length, exp_access, attr); +} + +/** + * ibv_exp_create_res_domain - create resource domain + */ +static inline struct ibv_exp_res_domain *ibv_exp_create_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain_init_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_create_res_domain); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_RES_DOMAIN_RESERVED - 1); + + return vctx->exp_create_res_domain(context, attr); +} + +/** + * ibv_exp_destroy_res_domain - destroy resource domain + */ +static inline int ibv_exp_destroy_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain *res_dom, + struct ibv_exp_destroy_res_domain_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_destroy_res_domain); + if (!vctx) + return errno = ENOSYS; + + if (attr) + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_DESTROY_RES_DOMAIN_RESERVED - 1); + + return vctx->exp_destroy_res_domain(context, res_dom, attr); +} + +/** + * ibv_exp_query_intf - query for family of verbs interface for specific QP/CQ + * + * Usually family of data-path verbs. + * Application may call ibv_exp_query_intf for QPs in the following states: + * IBV_QPS_INIT, IBV_QPS_RTR and IBV_QPS_RTS + * + * Returns the family of verbs. + * On failure returns NULL. The failure reason provided by the 'status' + * output variable. 
+ */ +static inline void *ibv_exp_query_intf(struct ibv_context *context, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_query_intf); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(params->comp_mask, + IBV_EXP_QUERY_INTF_RESERVED - 1); + + return vctx->exp_query_intf(context, params, status); +} + +/** + * ibv_exp_release_intf - release the queried interface + */ +static inline int ibv_exp_release_intf(struct ibv_context *context, void *intf, + struct ibv_exp_release_intf_params *params) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_release_intf); + if (!vctx) + return errno = ENOSYS; + + if (params) + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(params->comp_mask, + IBV_EXP_RELEASE_INTF_RESERVED - 1); + + return vctx->exp_release_intf(context, intf, params); +} + +static inline struct ibv_exp_wq *ibv_exp_create_wq(struct ibv_context *context, + struct ibv_exp_wq_init_attr *wq_init_attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_create_wq); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(wq_init_attr->comp_mask, + IBV_EXP_CREATE_WQ_RESERVED - 1); + + return vctx->exp_create_wq(context, wq_init_attr); +} + +static inline int ibv_exp_modify_wq(struct ibv_exp_wq *wq, struct ibv_exp_wq_attr *wq_attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(wq->context, exp_modify_wq); + if (!vctx) + return ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(wq_attr->attr_mask, + IBV_EXP_WQ_ATTR_RESERVED - 1); + return vctx->exp_modify_wq(wq, wq_attr); +} + +static inline int ibv_exp_destroy_wq(struct ibv_exp_wq *wq) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(wq->context, exp_destroy_wq); + if (!vctx) + return ENOSYS; + + return 
vctx->exp_destroy_wq(wq); +} + +/* + * ibv_exp_create_rwq_ind_table - Creates a RQ Indirection Table associated + * with the specified protection domain. + * @pd: The protection domain associated with the Indirection Table. + * @ibv_exp_rwq_ind_table_init_attr: A list of initial attributes required to + * create the Indirection Table. + * Return Value + * ibv_exp_create_rwq_ind_table returns a pointer to the created + * Indirection Table, or NULL if the request fails. + */ +static inline struct ibv_exp_rwq_ind_table *ibv_exp_create_rwq_ind_table(struct ibv_context *context, + struct ibv_exp_rwq_ind_table_init_attr *init_attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_create_rwq_ind_table); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + + IBV_EXP_RET_NULL_ON_INVALID_COMP_MASK(init_attr->comp_mask, + IBV_EXP_CREATE_IND_TABLE_RESERVED - 1); + return vctx->exp_create_rwq_ind_table(context, init_attr); +} + +/* + * ibv_exp_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @rwq_ind_table: The Indirection Table to destroy. + * Return Value + * ibv_destroy_rwq_ind_table() returns 0 on success, or the value of errno + * on failure (which indicates the failure reason). +*/ +static inline int ibv_exp_destroy_rwq_ind_table(struct ibv_exp_rwq_ind_table *rwq_ind_table) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(rwq_ind_table->context, exp_destroy_rwq_ind_table); + if (!vctx) + return ENOSYS; + + return vctx->exp_destroy_rwq_ind_table(rwq_ind_table); +} + +/* + * ibv_exp_query_gid_attr - query a GID attributes + * @context: ib context + * @port_num: port number + * @index: gid index in the gids table + * @attr: the gid attributes of index in the gids table + * Return value + * ibv_exp_query_gid_attr return 0 on success, or the value of errno on failure. 
+ */ +static inline int ibv_exp_query_gid_attr(struct ibv_context *context, + uint8_t port_num, + unsigned int index, + struct ibv_exp_gid_attr *attr) +{ + struct verbs_context_exp *vctx; + + vctx = verbs_get_exp_ctx_op(context, exp_query_gid_attr); + if (!vctx) + return ENOSYS; + + IBV_EXP_RET_EINVAL_ON_INVALID_COMP_MASK(attr->comp_mask, + IBV_EXP_QUERY_GID_ATTR_RESERVED - 1); + return vctx->exp_query_gid_attr(context, port_num, index, attr); +} +END_C_DECLS + +#define VERBS_MAX_ENV_VAL 4096 + +# undef __attribute_const + + +#endif /* INFINIBAND_VERBS_EXP_H */ Index: contrib/ofed/libibverbs/libibverbs.spec.in =================================================================== --- contrib/ofed/libibverbs/libibverbs.spec.in +++ contrib/ofed/libibverbs/libibverbs.spec.in @@ -1,15 +1,30 @@ +### +%{!?configure_options: %define configure_options %{nil}} +%{!?_with_valgrind: %define _with_valgrind 0} +%{!?_disable_valgrind: %define _disable_valgrind 0} + +%if 0%{?rhel} == 6 +%if 0%{_disable_valgrind} == 0 +%define _with_valgrind 1 +%endif +%endif +### + Name: libibverbs -Version: 1.1.4 +Version: 1.1.8mlnx1 Release: 1%{?dist} Summary: A library for direct userspace use of RDMA (InfiniBand/iWARP) hardware Group: System Environment/Libraries License: GPLv2 or BSD Url: http://openfabrics.org/ -Source: http://openfabrics.org/downloads/verbs/libibverbs-1.1.4.tar.gz +Source: http://openfabrics.org/downloads/verbs/libibverbs-%{version}.tar.gz BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) Requires(post): /sbin/ldconfig Requires(postun): /sbin/ldconfig +%if %{_with_valgrind} +BuildRequires: valgrind-devel +%endif %description libibverbs is a library that allows userspace processes to use RDMA @@ -25,6 +40,7 @@ Summary: Development files for the libibverbs library Group: System Environment/Libraries Requires: %{name} = %{version}-%{release} +Provides: %{_prefix}/include/infiniband/verbs.h %description devel Header files for the libibverbs library. 
@@ -49,12 +65,24 @@ %setup -q -n %{name}-@VERSION@ %build -%configure +%if %{_with_valgrind} +%configure %{configure_options} --libdir=%{_libdir}/mlnx_ofed/valgrind --with-valgrind +make %{?_smp_mflags} +make DESTDIR=$RPM_BUILD_DIR/%{name}-%{version}/valgrind install +rm -f $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind/*.*a +make clean +%endif + +%configure %{configure_options} make %{?_smp_mflags} %install rm -rf $RPM_BUILD_ROOT make DESTDIR=%{buildroot} install +%if %{_with_valgrind} +mkdir -p %{buildroot}/%{_libdir}/mlnx_ofed +cp -a $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind %{buildroot}/%{_libdir}/mlnx_ofed +%endif # remove unpackaged files from the buildroot rm -f $RPM_BUILD_ROOT%{_libdir}/*.la @@ -67,11 +95,17 @@ %files %defattr(-,root,root,-) %{_libdir}/libibverbs*.so.* +%if %{_with_valgrind} +%{_libdir}/mlnx_ofed/valgrind/libibverbs*.so.* +%endif %doc AUTHORS COPYING ChangeLog README %files devel %defattr(-,root,root,-) %{_libdir}/lib*.so +%if %{_with_valgrind} +%{_libdir}/mlnx_ofed/valgrind/lib*.so +%endif %{_includedir}/* %{_mandir}/man3/* %{_mandir}/man7/* @@ -86,6 +120,12 @@ %{_mandir}/man1/* %changelog +* Wed Dec 21 2011 Roland Dreier - 1.1.6-1 +- New upstream release + +* Tue Jun 28 2011 Roland Dreier - 1.1.5-1 +- New upstream release + * Thu Jun 3 2010 Roland Dreier - 1.1.4-1 - New upstream release @@ -103,6 +143,12 @@ - Update description to mention RDMA and iWARP, not just InfiniBand - Add "Requires" tag for libibverbs base package to -devel +* Mon Feb 18 2008 Fedora Release Engineering - 1.1.1-3 +- Autorebuild for GCC 4.3 + +* Tue Aug 28 2007 Fedora Release Engineering - 1.1.1-2 +- Rebuild for selinux ppc32 issue. 
+ * Fri Jun 15 2007 Roland Dreier - 1.1.1-1 - New upstream release @@ -121,7 +167,7 @@ * Thu May 4 2006 Roland Dreier - 1.0.4-1 - New upstream release -* Mon Mar 14 2006 Roland Dreier - 1.0.3-1 +* Tue Mar 14 2006 Roland Dreier - 1.0.3-1 - New upstream release * Mon Mar 13 2006 Roland Dreier - 1.0.1-1 Index: contrib/ofed/libibverbs/man/ibv_alloc_mw.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_alloc_mw.3 @@ -0,0 +1,53 @@ +.\" -*- nroff -*- +.\" +.TH IBV_ALLOC_MW 3 2012-06-20 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_alloc_mw, ibv_dealloc_mw \- allocate or deallocate a memory window (MW) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_mw *ibv_alloc_mw(struct ibv_pd " "*pd" , +.BI " enum ibv_mw_type " "type"); +.sp +.BI "int ibv_dealloc_mw(struct ibv_mw " "*mw" ); +.fi +.SH "DESCRIPTION" +.B ibv_alloc_mw() +allocates a memory window (MW) associated with the protection domain +.I pd\fR. +The MW's type (1 or 2A/2B) is +.I type\fR. +.PP +The MW is created unbound. For it to be useful, the MW must be bound, through either ibv_bind_mw (type 1) or a special WQE (type 2). Once bound, the memory window allows RDMA (remote) access to a subset of the MR to which it was bound, until invalidated/unbound/deallocated. +.PP +.B ibv_dealloc_mw() +Unbinds and deallocates the MW +.I mw\fR. +.SH "RETURN VALUE" +.B ibv_alloc_mw() +returns a pointer to the registered MW, or NULL if the request fails. +The remote key (\fBR_Key\fR) +field +.B rkey +is used by remote processes to perform Atomic and RDMA operations. This key will be changed during bind operations. The remote process places this +.B rkey +as the rkey field of struct ibv_send_wr passed to the ibv_post_send function. +.PP +.B ibv_dealloc_mw() +returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES" +.B ibv_dereg_mr() +fails if any memory window is still bound to this MR. +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_post_send (3), +.BR ibv_bind_mw (3), +.BR ibv_reg_mr (3), +.SH "AUTHORS" +.TP +Haggai Eran +.TP +Shachar Raindel +.TP +Yaniv Saar Index: contrib/ofed/libibverbs/man/ibv_alloc_pd.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_alloc_pd.3 +++ contrib/ofed/libibverbs/man/ibv_alloc_pd.3 @@ -37,4 +37,4 @@ .BR ibv_create_ah_from_wc (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_attach_mcast.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_attach_mcast.3 +++ contrib/ofed/libibverbs/man/ibv_attach_mcast.3 @@ -50,4 +50,4 @@ .BR ibv_create_qp (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_cc_pingpong.1 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_cc_pingpong.1 @@ -0,0 +1,79 @@ +.TH IBV_CC_PINGPONG 1 2013-03-10 "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_cc_pingpong \- ping-pong test demonstrates using of WAIT and CALC work requests. + +.SH SYNOPSIS +.B ibv_cc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-m mtu] +[\-c calc] [\-t op_type] [\-o operands] [\-w wait] +\fBHOSTNAME\fR + +.B ibv_cc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-m mtu] +[\-c calc] [\-t op_type] [\-o operands] [\-w wait] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the reliable +connected (RC) transport using WAIT on CQ work request +and CALC work request. 
+ +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 4096) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 1000) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +use \fISL\fR as the service level value of the QP (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-m\fR, \fB\-\-mtu\fR=\fISIZE\fR +path MTU (default 4096) +.TP +\fB\-c\fR, \fB\-\-calc\fR=\fIOPERATION\fR +calc operation +.TP +\fB\-t\fR, \fB\-\-op_type\fR=\fITYPE\fR +calc operands type +.TP +\fB\-o\fR, \fB\-\-operands\fR=\fIO1,O2...\fR +comma separated list of operands +.TP +\fB\-w\fR, \fB\-\-wait_cq\fR=\fIWAIT\fR +wait for entries on CQ + +.SH SEE ALSO +.BR ibv_uc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_srq_pingpong (1) + +.SH AUTHORS +.TP +Igor Ivanov +.RI < Igor.Ivanov@itseez.com > +.TP +Roland Dreier +.RI < rolandd@cisco.com > Index: contrib/ofed/libibverbs/man/ibv_create_ah.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_ah.3 +++ contrib/ofed/libibverbs/man/ibv_create_ah.3 @@ -61,4 +61,4 @@ .BR ibv_create_ah_from_wc (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_ah_from_wc.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_ah_from_wc.3 +++ contrib/ofed/libibverbs/man/ibv_create_ah_from_wc.3 @@ -60,4 +60,4 @@ .BR ibv_poll_cq (3) .SH
"AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_comp_channel.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_comp_channel.3 +++ contrib/ofed/libibverbs/man/ibv_create_comp_channel.3 @@ -47,4 +47,4 @@ .BR ibv_get_cq_event (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_cq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_cq.3 +++ contrib/ofed/libibverbs/man/ibv_create_cq.3 @@ -55,4 +55,4 @@ .BR ibv_create_qp (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_flow.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_create_flow.3 @@ -0,0 +1,87 @@ +.TH IBV_CREATE_FLOW 3 2013-08-21 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_flow, ibv_destroy_flow \- create or destroy flow steering rules +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_flow *ibv_create_flow(struct ibv_qp " "*qp" , +.BI " struct ibv_flow_attr " "*flow"); +.BI "int ibv_destroy_flow(struct ibv_flow " "*flow_id"); +.sp +.fi +.SH "DESCRIPTION" +.SS ibv_create_flow() +allows a user application QP +.I qp +to be attached into a specified flow +.I flow +which is defined in +.I +.PP +.nf +struct ibv_flow_attr { +.in +8 +uint32_t comp_mask; /* Future extendibility */ +enum ibv_flow_attr_type type; /* Rule type - see below */ +uint16_t size; /* Size of command */ +uint16_t priority; /* Rule priority - See below */ +uint8_t num_of_specs; /* Number of ibv_flow_spec_xxx */ +uint8_t port; /* The uplink port number */ +uint32_t flags; /* Extra flags for rule - see below */ +/* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx + * struct ibv_flow_spec_yyy + */ +.in -8 +}; +.sp +.nf +enum ibv_flow_attr_type { +.in +8 
+IBV_FLOW_ATTR_NORMAL = 0x0, /* steering according to rule specifications */ +IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default unicast and multicast rule - receive all Eth traffic which isn't steered to any QP */ +IBV_FLOW_ATTR_MC_DEFAULT = 0x2, /* default multicast rule - receive all Eth multicast traffic which isn't steered to any QP */ +IBV_FLOW_ATTR_SNIFFER = 0x3, /* sniffer rule - receive all port traffic */ +.in -8 +}; +.sp +.nf +enum ibv_flow_flags { +.in +8 +IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, /* Apply the rules on packets that were sent from the attached QP through loopback. IB only.*/ +.in -8 +}; +.fi +.PP +Each header struct holds the relevant network layer parameters for matching. To enforce the match, the +user sets a mask for each parameter. If the bit is set in the mask, the corresponding bit in the value should be matched. +.br +Note that most vendors support either full mask (all "1"s) or zero mask (all "0"s). +.br +.B Network parameters in the relevant network structs should be given in network order (big endian). + +.SS Flow domains and priority +Flow steering defines the concept of domain and priority. Each domain represents a user agent that can attach a flow. The domains are prioritized. A higher priority domain will always supersede a lower priority domain when their flow specifications overlap. In addition to the domain, there is priority within each of the domains. Each domain has at most 2^12 priorities. A lower priority numeric value (higher priority) takes precedence over matching rules with higher numeric priority value (lower priority). It is important to note that the priority value of a flow spec is used not only to establish the precedence of conflicting flow matches but also as a way to abstract the order on which flow specs are tested for matches. Flows with higher priorities will be tested before flows with lower priorities. +.br +.B IB verbs have the higher priority domain.
+.PP +.SS ibv_destroy_flow() +destroys the flow +.I flow_id\fR. +.SH "RETURN VALUE" +.B ibv_create_flow() +returns a pointer to the flow, or NULL if the request fails. +.PP +.B ibv_destroy_flow() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +These verbs are available only for devices supporting IBV_DEVICE_MANAGED_FLOW_STEERING and +only for QPs of Transport Service Type +.BR IBV_QPT_UD +or +.BR IBV_QPT_RAW_PACKET +.PP +.SH "AUTHORS" +.TP +Matan Barak Hadar Hen Zion Index: contrib/ofed/libibverbs/man/ibv_create_qp.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_qp.3 +++ contrib/ofed/libibverbs/man/ibv_create_qp.3 @@ -28,9 +28,8 @@ struct ibv_cq *recv_cq; /* CQ to be associated with the Receive Queue (RQ) */ struct ibv_srq *srq; /* SRQ handle if QP is to be associated with an SRQ, otherwise NULL */ struct ibv_qp_cap cap; /* QP capabilities */ -enum ibv_qp_type qp_type; /* QP Transport Service Type: IBV_QPT_RC, IBV_QPT_UC, IBV_QPT_UD or IBV_QPT_XRC */ +enum ibv_qp_type qp_type; /* QP Transport Service Type: IBV_QPT_RC, IBV_QPT_UC, IBV_QPT_UD or IBV_QPT_RAW_PACKET */ int sq_sig_all; /* If set, each Work Request (WR) submitted to the SQ generates a completion entry */ -struct ibv_xrc_domain *xrc_domain; /* XRC domain the QP will be associated with (valid only for IBV_QPT_XRC QP), otherwise NULL */ .in -8 }; .sp @@ -64,12 +63,6 @@ .B ibv_destroy_qp() returns 0 on success, or the value of errno on failure (which indicates the failure reason). .SH "NOTES" -.B ibv_create_qp() -will fail if a it is asked to create QP of a type other than -.B IBV_QPT_RC -or -.B IBV_QPT_UD -associated with an SRQ. 
.PP The attributes max_recv_wr and max_recv_sge are ignored by .B ibv_create_qp() @@ -83,4 +76,4 @@ .BR ibv_query_qp (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_qp_ex.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_create_qp_ex.3 @@ -0,0 +1,86 @@ +.\" -*- nroff -*- +.\" +.TH IBV_CREATE_QP_EX 3 2014-04-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_qp_ex, ibv_destroy_qp \- create or destroy a queue pair (QP) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_qp *ibv_create_qp_ex(struct ibv_context " "*context" , +.BI " struct ibv_qp_init_attr_ex " "*qp_init_attr_ex" ); +.sp +.BI "int ibv_destroy_qp(struct ibv_qp " "*qp" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_qp_ex() +creates a queue pair (QP) associated with the protection domain +.I pd\fR. +The argument +.I qp_init_attr_ex +is an ibv_qp_init_attr_ex struct, as defined in . +.PP +.nf +struct ibv_qp_init_attr_ex { +.in +8 +void *qp_context; /* Associated context of the QP */ +struct ibv_cq *send_cq; /* CQ to be associated with the Send Queue (SQ) */ +struct ibv_cq *recv_cq; /* CQ to be associated with the Receive Queue (RQ) */ +struct ibv_srq *srq; /* SRQ handle if QP is to be associated with an SRQ, otherwise NULL */ +struct ibv_qp_cap cap; /* QP capabilities */ +enum ibv_qp_type qp_type; /* QP Transport Service Type: IBV_QPT_RC, IBV_QPT_UC, IBV_QPT_UD or IBV_QPT_RAW_PACKET */ +int sq_sig_all; /* If set, each Work Request (WR) submitted to the SQ generates a completion entry */ +uint32_t comp_mask; /* Identifies valid fields */ +struct ibv_pd *pd; /* PD to be associated with the QP */ +struct ibv_xrcd *xrcd; /* XRC domain to be associated with the target QP */ +.in -8 +}; +.sp +.nf +struct ibv_qp_cap { +.in +8 +uint32_t max_send_wr; /* Requested max number of outstanding WRs in the SQ */ +uint32_t max_recv_wr; /* Requested max number of outstanding WRs in the RQ */ 
+uint32_t max_send_sge; /* Requested max number of scatter/gather (s/g) elements in a WR in the SQ */ +uint32_t max_recv_sge; /* Requested max number of s/g elements in a WR in the RQ */ +uint32_t max_inline_data;/* Requested max number of data (bytes) that can be posted inline to the SQ, otherwise 0 */ +.in -8 +}; +.fi +.PP +The function +.B ibv_create_qp_ex() +will update the +.I qp_init_attr_ex\fB\fR->cap +struct with the actual \s-1QP\s0 values of the QP that was created; +the values will be greater than or equal to the values requested. +.PP +.B ibv_destroy_qp() +destroys the QP +.I qp\fR. +.SH "RETURN VALUE" +.B ibv_create_qp_ex() +returns a pointer to the created QP, or NULL if the request fails. +Check the QP number (\fBqp_num\fR) in the returned QP. +.PP +.B ibv_destroy_qp() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.PP +The attributes max_recv_wr and max_recv_sge are ignored by +.B ibv_create_qp_ex() +if the QP is to be associated with an SRQ. +.PP +.B ibv_destroy_qp() +fails if the QP is attached to a multicast group.
+.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_qp (3), +.BR ibv_query_qp (3) +.SH "AUTHORS" +.TP +Yishai Hadas +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_create_srq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_create_srq.3 +++ contrib/ofed/libibverbs/man/ibv_create_srq.3 @@ -10,26 +10,12 @@ .BI "struct ibv_srq *ibv_create_srq(struct ibv_pd " "*pd" ", struct " .BI " ibv_srq_init_attr " "*srq_init_attr" ); .sp -.BI "struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd " "*pd" ", -.BI " struct ibv_xrc_domain " "*xrc_domain" ", -.BI " struct ibv_cq " "*xrc_cq" ", -.BI " struct ibv_srq_init_attr " "*srq_init_attr" ); -.sp .BI "int ibv_destroy_srq(struct ibv_srq " "*srq" ); .fi .SH "DESCRIPTION" .B ibv_create_srq() creates a shared receive queue (SRQ) associated with the protection domain .I pd\fR. -.PP -.B ibv_create_xrc_srq() -creates an XRC shared receive queue (SRQ) associated with the protection domain -.I pd\fR, -the XRC domain -.I xrc_domain -and the CQ which will hold the XRC completion -.I xrc_cq\fR. -.PP The argument .I srq_init_attr is an ibv_srq_init_attr struct, as defined in . 
@@ -78,4 +64,4 @@ .BR ibv_query_srq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_create_srq_ex.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_create_srq_ex.3 @@ -0,0 +1,71 @@ +.\" -*- nroff -*- +.\" +.TH IBV_CREATE_SRQ_EX 3 2013-06-26 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_srq_ex, ibv_destroy_srq \- create or destroy a shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_srq *ibv_create_srq_ex(struct ibv_context " "*context" ", struct " +.BI " ibv_srq_init_attr_ex " "*srq_init_attr_ex" ); +.sp +.BI "int ibv_destroy_srq(struct ibv_srq " "*srq" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_srq_ex() +creates a shared receive queue (SRQ) supporting both basic and xrc modes. +The argument +.I srq_init_attr_ex +is an ibv_srq_init_attr_ex struct, as defined in . +.PP +.nf +struct ibv_srq_init_attr_ex { +.in +8 +void *srq_context; /* Associated context of the SRQ */ +struct ibv_srq_attr attr; /* SRQ attributes */ +uint32_t comp_mask; /* Identifies valid fields */ +enum ibv_srq_type srq_type; /* Basic / XRC */ +struct ibv_pd *pd; /* PD associated with the SRQ */ +struct ibv_xrcd *xrcd; /* XRC domain to associate with the SRQ */ +struct ibv_cq *cq; /* CQ to associate with the SRQ for XRC mode */ +.in -8 +}; +.sp +.nf +struct ibv_srq_attr { +.in +8 +uint32_t max_wr; /* Requested max number of outstanding work requests (WRs) in the SRQ */ +uint32_t max_sge; /* Requested max number of scatter elements per WR */ +uint32_t srq_limit; /* The limit value of the SRQ */ +.in -8 +}; +.fi +.PP +The function +.B ibv_create_srq_ex() +will update the +.I srq_init_attr_ex +struct with the original values of the SRQ that was created; the +values of max_wr and max_sge will be greater than or equal to the +values requested. +.PP +.B ibv_destroy_srq() +destroys the SRQ +.I srq\fR. 
+.SH "RETURN VALUE" +.B ibv_create_srq_ex() +returns a pointer to the created SRQ, or NULL if the request fails. +.PP +.B ibv_destroy_srq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.B ibv_destroy_srq() +fails if any queue pair is still associated with this SRQ. +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_srq (3), +.BR ibv_query_srq (3) +.SH "AUTHORS" +.TP +Yishai Hadas Index: contrib/ofed/libibverbs/man/ibv_devinfo.1 =================================================================== --- contrib/ofed/libibverbs/man/ibv_devinfo.1 +++ contrib/ofed/libibverbs/man/ibv_devinfo.1 @@ -33,7 +33,7 @@ .SH AUTHORS .TP Dotan Barak -.RI < dotanb@mellanox.co.il > +.RI < dotanba@gmail.com > .TP Roland Dreier .RI < rolandd@cisco.com > Index: contrib/ofed/libibverbs/man/ibv_exp_alloc_mkey_list_memory.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_alloc_mkey_list_memory.3 @@ -0,0 +1,59 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_ALLOC_MKEY_LIST_MEMORY 3 2014-08-28 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_alloc_mkey_list_memory \- allocates a buffer for UMR when using non-inline registration +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_exp_mkey_list_container *ibv_exp_alloc_mkey_list_memory(struct ibv_exp_mkey_list_container_attr " "*attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_alloc_mkey_list_memory() +allocates a buffer for non-inline UMR registration associated with +the protection domain (PD) +.I attr->pd\fR, +of type +.I attr->mkey_list_type\fR, +and sized for at most +.I attr->max_klm_list_size +KLMs used to create the UMR. +.PP +The argument +.I attr +is an ibv_exp_mkey_list_container_attr struct, as defined in <infiniband/verbs_exp.h>.
+.PP +.nf +struct ibv_exp_mkey_list_container_attr { +.in +8 +struct ibv_pd *pd; /* protection domain (PD) associated with the UMR */ +uint32_t mkey_list_type; /* use ibv_exp_mkey_list_type */ +uint32_t max_klm_list_size; /* maximum number of MRs we can use to create the UMR */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; + +.fi +.I attr->mkey_list_type +describes the type of UMR we want to create; it is one of the following flags: +.PP +.TP +.B IBV_EXP_MKEY_LIST_TYPE_INDIRECT_MR \fR indirect UMR Type +.PP +.SH "RETURN VALUE" +.B ibv_exp_alloc_mkey_list_memory() +returns a pointer to struct ibv_exp_mkey_list_container, or NULL if the request fails. + +.SH "NOTES" +.PP +This struct is needed only when the UMR is filled using a non-inline post send. +.PP +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_exp_dealloc_mkey_list_memory (3), +.BR ibv_post_send (3), +.BR ibv_exp_create_mr (3) +.SH "AUTHORS" +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_bind_mw.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_bind_mw.3 @@ -0,0 +1,100 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_BIND_MW 3 2014-04-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_bind_mw \- post a request to bind a type 1 memory window to a memory region +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_bind_mw(struct ibv_exp_mw_bind " "*mw_bind" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_bind_mw() +posts to the queue pair +.I mw_bind->qp +a request to bind the memory window +.I mw_bind->mw +according to the details in +.I mw_bind\fR. +.PP +The argument +.I mw_bind +is an ibv_exp_mw_bind struct, as defined in <infiniband/verbs_exp.h>.
+.PP +.nf +struct ibv_exp_mw_bind { +.in +8 +struct ibv_qp *qp; +struct ibv_mw *mw; +uint64_t wr_id; /* User defined WR ID */ +uint64_t exp_send_flags; /* Use ibv_exp_send_flags */ +struct ibv_exp_mw_bind_info bind_info; +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.PP +struct ibv_exp_mw_bind_info { +.in +8 +struct ibv_mr *mr; /* The MR to bind the MW to */ +uint64_t addr; /* The address the MW should start at */ +uint64_t length; /* The length (in byte) the MW should span */ +uint64_t exp_mw_access_flags; /* Access flags to the MW. Use ibv_exp_access_flags */ +.in -8 +}; + +.fi +.PP +The QP Transport Service Type must be either UC or RC for bind operations. +.PP +The attribute exp_send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags: +.PP +.TP +.B IBV_EXP_SEND_FENCE \fR Set the fence indicator. Valid only for QPs with Transport Service Type \fBIBV_QPT_RC +.TP +.B IBV_EXP_SEND_SIGNALED \fR Set the completion notification indicator. Relevant only if QP was created with sq_sig_all=0 +.TP +.B IBV_EXP_SEND_SOLICITED \fR Set the solicited event indicator. Valid only for Send and RDMA Write with immediate +.PP +The exp_mw_access_flags define the allowed access to the MW after the bind +completes successfully. It is either 0 or the bitwise \s-1OR\s0 of one +or more of the following flags: +.TP +.B IBV_EXP_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access. Requires local write access to the MR. +.TP +.B IBV_EXP_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_EXP_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported). Requires local write access to the MR. +.TP +.B IBV_EXP_ACCESS_MW_ZERO_BASED\fR If set, the address given in post send is offset from the MW's start address. +.SH "RETURN VALUE" +.B ibv_exp_bind_mw() +returns 0 on success, or the value of errno on failure (which +indicates the failure reason). 
In case of a success, the R_Key of the +memory window after the bind is returned in the mw_bind->mw->rkey field. +.SH "NOTES" +The bind does not complete when the function returns - it is merely +posted to the QP. The user should keep a copy of the old R_Key, and +fix the mw structure if the subsequent CQE for the bind operation +indicates a failure. The user may safely send the R_Key using a send +request on the same QP, but must not transfer it to the remote in any +other manner before reading a successful CQE. +.PP +Note that for type 2 MW, one should directly post bind WQE to the QP, +using ibv_post_send. +.SH "SEE ALSO" +.BR ibv_alloc_mw (3), +.BR ibv_post_send (3), +.BR ibv_poll_cq (3), +.BR ibv_reg_mr (3) +.SH "AUTHORS" +.TP +Haggai Eran +.TP +Shachar Raindel +.TP +Yaniv Saar +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_create_cq.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_cq.3 @@ -0,0 +1,100 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_CREATE_CQ 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_cq, ibv_destroy_cq \- create or destroy a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_cq *ibv_exp_create_cq(struct ibv_context " "*context" ", int " "cqe" , +.BI " void " "*cq_context" , +.BI " struct ibv_comp_channel " "*channel" , +.BI " int " "comp_vector" , +.BI " struct ibv_exp_cq_init_attr" " *attr"); +.sp +.BI "int ibv_destroy_cq(struct ibv_cq " "*cq" ); +.fi +.SH "DESCRIPTION" +.fi +.B ibv_exp_create_cq() +creates a completion queue (CQ) with at least +.I cqe +entries for the RDMA device context +.I context\fR. +The pointer +.I cq_context +will be used to set the user context pointer of the CQ structure. The argument +.I channel +is optional; if not NULL, the completion channel +.I channel +will be used to return completion events.
The CQ will use the +completion vector +.I comp_vector +for signaling completion events; it must be at least zero and less than +.I context\fR->num_comp_vectors. + +The argument +.I attr +is an ibv_exp_cq_init_attr struct, as defined in . +.nf +struct ibv_exp_cq_init_attr { +.in +8 +.fi +uint32_t comp_mask; /* Identifies valid fields. Use enum ibv_exp_cq_init_attr_mask */ +.nf +uint32_t flags; /* Flags mask to create CQ with extra features */ +struct ibv_exp_res_domain *res_domain; /* Provides resource domain to indicate the CQ threading and message model */ +.in -8 +}; + +enum ibv_exp_cq_init_attr_mask{ +.in +8 +.fi +IBV_EXP_CQ_INIT_ATTR_FLAGS = 1 << 0, /* Set if attr->flags is valid */ +.nf +IBV_EXP_CQ_INIT_ATTR_RESERVED = 1 << 1, /* Use this - 1 if all fields are valid */ +.in -8 +}; + +.fi +.I attr->flags +specifies the CQ features. It is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_CQ_CREATE_CROSS_CHANNEL \fR Create CQ with Cross Channel +.TP +.B IBV_EXP_CQ_TIMESTAMP \fR Return timestamp in WC +.TP +.B IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME \fR Convert the timestamp in WC to system time (unsupported by mlx4) + +.PP +.B ibv_destroy_cq() +destroys the CQ +.I cq\fR. +.SH "RETURN VALUE" +.B ibv_exp_create_cq() +returns a pointer to the CQ, or NULL if the request fails. +.PP +.B ibv_destroy_cq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.B ibv_exp_create_cq() +may create a CQ with size greater than or equal to the requested +size. Check the cqe attribute in the returned CQ for the actual size. +.PP +.B ibv_destroy_cq() +fails if any queue pair is still associated with this CQ. 
+.SH "SEE ALSO" +.BR ibv_resize_cq (3), +.BR ibv_req_notify_cq (3), +.BR ibv_ack_cq_events (3), +.BR ibv_create_qp (3) +.BR ibv_exp_create_res_domain (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_create_dct.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_dct.3 @@ -0,0 +1,52 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_CREATE_DCT 3 2013-12-10 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_dct \- Create a Dynamically Connected Target (DCT) endpoint. +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_dct *ibv_exp_create_dct(struct ibv_context " "*context," +.BI " struct ibv_exp_dct_init_attr " "*attr" ); +.sp +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_dct() +creates a DC target endpoint associated with the ibv_context +.I context\fR. +The argument +.I attr +is an ibv_exp_dct_init_attr struct, as defined in . +.PP +.nf +struct ibv_exp_dct_init_attr { +.in +8 +struct ibv_pd *pd; /* PD associated with the protection domain */ +struct ibv_cq *cq; /* CQ used to report receive completions */ +struct ibv_srq *srq; /* The SRQ that will provide receive buffers */ +uint64_t dc_key; /* DC access key (64 bit key) */ +uint8_t port; /* Port number */ +uint32_t access_flags; /* Access flags (IBV_ACCESS_REMOTE_READ/WRITE/ATOMIC) */ +uint8_t min_rnr_timer; /* Min rnr NAK time between successive requests of rejected messages */ +uint8_t tclass; /* Traffic class used in packets sent by the DCT in case GRH is used */ +uint32_t flow_label; /* Flow label used in packets sent by the DCT in case GRH is used */ +enum ibv_mtu mtu; /* MTU of the DCT */ +uint8_t pkey_index; /* PKey index */ +uint8_t gid_index; /* Gid index associated with the DCT (to verify incoming packets if GRH is used) */ +uint8_t hop_limit; /* Hop limit used in packets sent by the DCT in case GRH is used */ +uint32_t inline_size; /* The size 
requested by the user to be inline received */ +uint32_t create_flags; /* Reserved (must be 0) */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.fi +.SH "RETURN VALUE" +.B ibv_exp_create_dct() +returns a pointer to the created DCT, or NULL if the request fails. +.SH "AUTHORS" +.TP +Moshe Lazer +.TP +Eli Cohen +.TP Index: contrib/ofed/libibverbs/man/ibv_exp_create_mr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_mr.3 @@ -0,0 +1,123 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_CREATE_MR 3 2014-08-28 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_mr \- create empty memory region (MR) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_mr *ibv_exp_create_mr(struct ibv_exp_create_mr_in " "*in" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_mr() +create empty memory region (MR) associated with the protection domain +.I in->pd\fR. +The MR's init attributes are +.I in->attr +and the maximum KLMs we can use to fill it is +.I in->attr.max_klm_list_size\fR. +.PP +The argument +.I in +is an ibv_exp_create_mr_in struct, as defined in . +.PP +.nf +struct ibv_exp_create_mr_in { +.in +8 +struct ibv_pd *pd; /* protection domain (PD) associated with the MR */ +struct ibv_exp_mr_init_attr attr; /* Initialization attributes of the memory region (MR) */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.PP +.fi +The argument +.I in->attr +is an ibv_exp_mr_init_attr struct, as defined in . 
+.PP +.nf +struct ibv_exp_mr_init_attr { +.in +8 +uint32_t max_klm_list_size; /* Maximum number of KLMs we can use to fill the MR */ +uint32_t create_flags; /* use ibv_exp_mr_create_flags */ +uint32_t exp_access_flags; /* reserved for future growth (must be 0) */ +.in -8 +}; + +.fi +.I in->attr.exp_access +describes the desired memory protection attributes; it is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_ACCESS_LOCAL_WRITE \fR Enable Local Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_EXP_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported) +.TP +.B IBV_EXP_ACCESS_MW_BIND\fR Enable Memory Window Binding +.TP +.B IBV_EXP_ACCESS_ALLOCATE_MR\fR Request the low level driver to allocate the memory used for backing the MR. Could improve performance in some cases. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_USER_READ\fR Enable sharing this MR for reading by user (application owner). +.TP +.B IBV_EXP_ACCESS_SHARED_MR_USER_WRITE\fR Enable sharing this MR for writing by user. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_GROUP_READ\fR Enable sharing this MR for reading by group (application group). +.TP +.B IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE\fR Enable sharing this MR for writing by group. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_OTHER_READ\fR Enable sharing this MR for reading by other. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE\fR Enable sharing this MR for writing by other. +.TP +.B IBV_EXP_ACCESS_ON_DEMAND\fR Create an on-demand paging MR. +.PP +If +.B IBV_EXP_ACCESS_REMOTE_WRITE +or +.B IBV_EXP_ACCESS_REMOTE_ATOMIC +is set, then +.B IBV_EXP_ACCESS_LOCAL_WRITE +must be set too. +.PP +Local read access is always enabled for the MR. 
+.fi +.I in->attr.create_flags +describes the capabilities of the MR we want to create; it is one of the following flags: +.PP +.TP +.B IBV_EXP_MR_INDIRECT_KLMS\fR Enable capabilities of creating KLM using ibv_exp_post_send() +.PP +.SH "RETURN VALUE" +.B ibv_exp_create_mr() +returns a pointer to an empty MR with lkey, or NULL if the request fails. +The local key (\fBL_Key\fR) field +.B lkey +is used as the lkey field of struct ibv_sge when posting buffers with +ibv_post_* verbs, and the remote key (\fBR_Key\fR) +field +.B rkey +is used by remote processes to perform Atomic and RDMA operations. The remote process places this +.B rkey +as the rkey field of struct ibv_send_wr passed to the ibv_post_send function. + +.SH "NOTES" +.PP +The user can't use this MR until it has been filled using +.I ibv_exp_post_send +with opcode +.I IBV_EXP_WR_UMR_FILL + +.SH "SEE ALSO" +.BR ibv_dereg_mr (3), +.BR ibv_exp_reg_mr (3), +.BR ibv_alloc_pd (3), +.BR ibv_post_send (3), +.BR ibv_post_recv (3) +.SH "AUTHORS" +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_create_qp.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_qp.3 @@ -0,0 +1,128 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_CREATE_QP 3 2014-24-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_qp \- create a queue pair (QP) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_qp *ibv_exp_create_qp(struct ibv_context " "*context" , +.BI " struct ibv_exp_qp_init_attr " "*qp_init_attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_qp() +creates a queue pair (QP) associated with the protection domain +.I qp_init_attr->pd\fR. +.fi +The argument +.I qp_init_attr +is an ibv_exp_qp_init_attr struct, as defined in <infiniband/verbs_exp.h>.
+.PP +.nf +struct ibv_exp_qp_init_attr { +.in +8 +void *qp_context; /* Associated context of the QP */ +struct ibv_cq *send_cq; /* CQ to be associated with the Send Queue (SQ) */ +struct ibv_cq *recv_cq; /* CQ to be associated with the Receive Queue (RQ) */ +struct ibv_srq *srq; /* SRQ handle if QP is to be associated with an SRQ, otherwise NULL */ +struct ibv_qp_cap cap; /* QP capabilities */ +enum ibv_qp_type qp_type; /* QP Transport Service Type (RC, UD, UC etc.) */ +int sq_sig_all; /* If set, each Work Request (WR) submitted to the SQ generates a completion entry */ +uint32_t comp_mask; /* Identifies valid fields. Use ibv_exp_qp_init_attr_comp_mask */ +struct ibv_pd *pd; /* PD to be associated with the QP */ +struct ibv_xrcd *xrcd; /* XRC domain to be associated with the target QP */ +uint32_t exp_create_flags; /* Creation flags for this QP. Use ibv_exp_qp_create_flags */ +uint32_t max_inl_recv; /* Requested size of inline-receive */ +struct ibv_exp_qpg qpg; /* QP group, used for RSS/TSS */ +uint32_t max_atomic_arg; /* Request max atomic argument size for atomic masked operations */ +uint32_t max_inl_send_klms;/* Requested max number of KLMs when creating UMR with IBV_EXP_SEND_INLINE */ +struct ibv_exp_res_domain *res_domain; /* Provides resource domain to indicate the QP threading and message model */ +struct ibv_exp_rx_hash_conf *rx_hash_conf;/* RX hash configuration used for RSS */ +uint8_t port_num; /* Port number to be associated with, applicable only to RX QP */ +.in -8 +}; +.sp +.nf +struct ibv_qp_cap { +.in +8 +uint32_t max_send_wr; /* Requested max number of outstanding WRs in the SQ */ +uint32_t max_recv_wr; /* Requested max number of outstanding WRs in the RQ */ +uint32_t max_send_sge; /* Requested max number of scatter/gather (s/g) elements in a WR in the SQ */ +uint32_t max_recv_sge; /* Requested max number of s/g elements in a WR in the RQ */ +uint32_t max_inline_data;/* Requested max number of data (bytes) that can be posted inline to the SQ,
otherwise 0 */ +.in -8 +}; +.fi +.sp +.nf +The attribute exp_create_flags describes the properties of the QP. It is either 0 or the bitwise OR of one or more of the following flags: +enum ibv_exp_qp_create_flags { +.in +8 +IBV_EXP_QP_CREATE_CROSS_CHANNEL = (1 << 2), /* Create Cross-Channel */ +IBV_EXP_QP_CREATE_MANAGED_SEND = (1 << 3), /* Create managed send queue */ +IBV_EXP_QP_CREATE_MANAGED_RECV = (1 << 4), /* Create managed recv queue */ +IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW = (1 << 6), /* Send queue overflow will be ignored */ +IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW = (1 << 7), /* Recv queue overflow will be ignored */ +IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY = (1 << 8), /* Should be set to use atomics on devices that are ATOMIC_HCA_REPLY_BE */ +IBV_EXP_QP_CREATE_UMR = (1 << 9), /* Create UMR */ +.in -8 +}; +.fi +.sp +.nf +struct ibv_exp_rx_hash_conf { +.in +8 +uint8_t rx_hash_function; /* Use enum ibv_exp_rx_hash_function_flags */ +uint8_t rx_hash_key_len; /* key length - valid only for Toeplitz */ +uint8_t *rx_hash_key; /* key value - valid only for Toeplitz */ +uint64_t rx_hash_fields_mask; /* Use enum ibv_exp_rx_hash_fields to set which incoming packet field should participates in RX hash */ +struct ibv_exp_rwq_ind_table *rwq_ind_tbl; /* Receive work queue indirection table */ +.in -8 +}; +.fi +.PP +The function +.B ibv_exp_create_qp() +will update the +.I qp_init_attr\fB\fR->cap +struct with the actual \s-1QP\s0 values of the QP that was created; +the values will be greater than or equal to the values requested. +It will also update the +.I qp_init_attr\fB\fR->max_inl_recv +in the same way. +.SH "RETURN VALUE" +.B ibv_exp_create_qp() +returns a pointer to the created QP, or NULL if the request fails. +Check the QP number (\fBqp_num\fR) in the returned QP. +.SH "NOTES" +.PP +The attributes max_recv_wr and max_recv_sge are ignored by +.B ibv_exp_create_qp() +if the QP is to be associated with an SRQ. +.PP +If the QP is RX one (i.e. 
ibv_exp_rx_hash_conf was set) it serves only as a steering entry: +.nf +1) Input CQs,SRQ must be NULL. +.nf +2) Input capabilities relate to send/recv must be 0. +.nf +3) ibv_post_send/recv is not supported on. +.nf +4) qp_type must support RX hash, use ibv_exp_query_device to get rx_hash_caps and evaluate. +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_qp (3), +.BR ibv_exp_modify_qp (3), +.BR ibv_exp_query_qp (3), +.BR ibv_query_qp (3) +.BR ibv_exp_create_res_domain (3), +.BR ibv_exp_create_rwq_ind_table (3), +.BR ibv_exp_query_device (3) +.SH "AUTHORS" +.TP +Yishai Hadas +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_create_res_domain.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_res_domain.3 @@ -0,0 +1,67 @@ +.TH IBV_EXP_CREATE_RES_DOMAIN 3 2015-05-25 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_res_domain \- create resource domain +.PP +ibv_exp_destroy_res_domain \- destroy resource domain +.SH "SYNOPSIS" +.nf +.B #include +.BI "static inline struct ibv_exp_res_domain *ibv_exp_create_res_domain(struct ibv_context " "*context", +.BI " struct ibv_exp_res_domain_init_attr " "*attr" ");" + +.BI "static inline int ibv_exp_destroy_res_domain(struct ibv_context " "*context," +.BI " struct ibv_exp_res_domain " "*res_dom," +.BI " struct ibv_exp_destroy_res_domain_attr " "*attr" ");" +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_res_domain() +Creates resource domain which is a verb object that may be associated with QP and a CQ objects on creation to enhance data-path performance. +.PP +The argument +.I attr +is an ibv_exp_res_domain_init_attr struct, as defined in . 
+.PP +.nf +enum ibv_exp_thread_model { + IBV_EXP_THREAD_SAFE, /* The lib responsible to protect the object in multithreaded environment */ + IBV_EXP_THREAD_UNSAFE, /* The application responsible to protect the object in multithreaded environment */ + IBV_EXP_THREAD_SINGLE /* The object is called from only one thread */ +}; + +enum ibv_exp_msg_model { + IBV_EXP_MSG_DEFAULT, /* Use the provider default message model */ + IBV_EXP_MSG_LOW_LATENCY, /* Hint the provider to optimize for low latency */ + IBV_EXP_MSG_HIGH_BW, /* Hint the provider to optimize for high bandwidth */ + IBV_EXP_MSG_FORCE_LOW_LATENCY, /* Force the provider to optimize for low latency */ +}; + +enum ibv_exp_res_domain_init_attr_comp_mask { + IBV_EXP_RES_DOMAIN_THREAD_MODEL = (1 << 0), + IBV_EXP_RES_DOMAIN_MSG_MODEL = (1 << 1), + IBV_EXP_RES_DOMAIN_RESERVED = (1 << 2), +}; + +struct ibv_exp_res_domain_init_attr { + uint32_t comp_mask; /* use ibv_exp_res_domain_init_attr_comp_mask */ + enum ibv_exp_thread_model thread_model; + enum ibv_exp_msg_model msg_model; +}; + +.PP +.B ibv_exp_destroy_res_domain() +Destroys the resource domain. + +.SH "RETURN VALUE" +.B ibv_exp_create_res_domain() +returns a pointer to the created resource domain, or NULL if the request fails. + +.PP +.B ibv_exp_destroy_res_domain() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). + +.SH "NOTES" + +.SH "SEE ALSO" +.SH "AUTHORS" +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_create_rwq_ind_table.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_rwq_ind_table.3 @@ -0,0 +1,59 @@ +.\" -*- nroff -*- +.\" +.TH CREATE_RWQ_IND_TBL 3 2015-08-04 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_rwq_ind_table, ibv_exp_destroy_rwq_ind_table \- create or destroy a Receive Work Queue Indirection Table (RWQ IND TBL). 
+.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_exp_rwq_ind_table *ibv_exp_create_rwq_ind_table(struct ibv_context " "*context," +.BI " struct ibv_exp_rwq_ind_table_init_attr " "*init_attr" ); +.sp +.BI "int ibv_exp_destroy_rwq_ind_table(struct ibv_exp_rwq_ind_table " "*rwq_ind_table" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_rwq_ind_table() +creates a RWQ IND TBL associated with the ibv_context +.I context\fR. +The argument +.I init_attr +is an ibv_exp_rwq_ind_table_init_attr struct, as defined in . +.PP +.nf +struct ibv_exp_rwq_ind_table_init_attr { +.in +8 +struct ibv_pd *pd; /* PD to be associated with the created object */ +uint32_t log_ind_tbl_size; /* Log, base 2, of Indirection table size */ +struct ibv_exp_wq **ind_tbl; /* Each entry is a pointer to Receive Work Queue */ +uint32_t comp_mask; /* Identifies valid fields. Use ibv_exp_ind_table_init_attr_mask */ +.in -8 +}; +.fi +.PP +The function +.B ibv_exp_create_rwq_ind_table() +will create a RWQ IND TBL that holds a table of Receive Work Queue. +For further usage of the created object see below +.I NOTES\fR. +.PP +.B ibv_exp_destroy_rwq_ind_table() +destroys the RWQ IND TBL +.I rwq_ind_table\fR. +.SH "RETURN VALUE" +.B ibv_exp_create_rwq_ind_table() +returns a pointer to the created RWQ IND TBL, or NULL if the request fails. +.PP +.B ibv_exp_destroy_rwq_ind_table() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +The created object should be used as part of +.I ibv_exp_create_qp() +to enable dispatching of incoming packets based on some RX hash configuration. 
+.SH "SEE ALSO" +.BR ibv_exp_create_wq (3), +.BR ibv_exp_modify_wq (3), +.BR ibv_exp_create_qp (3), +.SH "AUTHORS" +.TP +Yishai Hadas Index: contrib/ofed/libibverbs/man/ibv_exp_create_wq.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_create_wq.3 @@ -0,0 +1,76 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_CREATE_WQ 3 2015-08-04 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_create_wq, ibv_exp_destroy_wq \- create or destroy a Work Queue (WQ). +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_exp_wq *ibv_exp_create_wq(struct ibv_context " "*context," +.BI " struct ibv_exp_wq_init_attr " "*wq_init_attr" ); +.sp +.BI "int ibv_exp_destroy_wq(struct ibv_exp_wq " "*wq" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_create_wq() +creates a WQ associated with the ibv_context +.I context\fR. +The argument +.I wq_init_attr +is an ibv_exp_wq_init_attr struct, as defined in . +.PP +.nf +struct ibv_exp_wq_init_attr { +.in +8 +void *wq_context; /* Associated context of the WQ */ +enum ibv_exp_wq_type wq_type; /* WQ type */ +uint32_t max_recv_wr; /* Requested max number of outstanding WRs in the RQ */ +uint32_t max_recv_sge; /* Requested max receive number of scatter/gather (s/g) elements per WR in the RQ */ +struct ibv_pd *pd; /* PD to be associated with the WQ */ +struct ibv_cq *cq; /* CQ to be associated with the WQ */ +struct ibv_srq *srq; /* SRQ handle if WQ is of type IBV_EXP_WQT_SRQ, otherwise NULL */ +uint32_t comp_mask; /* Identifies valid fields. 
Use ibv_exp_wq_init_attr_mask */ +struct ibv_exp_res_domain *res_domain; /* Provides resource domain to indicate the WQ threading and message model */ +struct ibv_exp_wq_mp_rq mp_rq; /* Provides multi-packet data for receive WQ */ +.in -8 +}; + +struct ibv_exp_wq_mp_rq { +.in +8 +enum ibv_exp_mp_rq_shifts use_shift; /* Shift to use for received payload */ +uint8_t single_wqe_log_num_of_strides; /* Log of number of strides for single WQE */ +uint8_t single_stride_log_num_of_bytes; /* Log of number of bytes in single stride */ +.in -8 +}; + +.fi +.PP +The function +.B ibv_exp_create_wq() +will update the +.I wq_init_attr\fB\fR->max_recv_wr +and +.I wq_init_attr\fB\fR->max_recv_sge +fields with the actual \s-1WQ\s0 values of the WQ that was created; +the values will be greater than or equal to the values requested. +.PP +.B ibv_exp_destroy_wq() +destroys the WQ +.I wq\fR. +.SH "RETURN VALUE" +.B ibv_exp_create_wq() +returns a pointer to the created WQ, or NULL if the request fails. +.PP +.B ibv_exp_destroy_wq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.PP +The attributes max_recv_wr and max_recv_sge are ignored by +.B ibv_exp_create_wq() +if the WQ is to be associated with an SRQ. 
+.SH "SEE ALSO" +.BR ibv_exp_modify_wq (3) +.SH "AUTHORS" +.TP +Yishai Hadas Index: contrib/ofed/libibverbs/man/ibv_exp_dealloc_mkey_list_memory.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_dealloc_mkey_list_memory.3 @@ -0,0 +1,38 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_DEALLOC_MKEY_LIST_MEMORY 3 2014-08-28 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_dealloc_mkey_list_memory \- deallocates the struct that was allocated using ibv_exp_alloc_mkey_list_memory() +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_dealloc_mkey_list_memory(struct ibv_exp_mkey_list_container " "*mem" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_dealloc_mkey_list_memory() +deallocates the argument +.I mem +that was allocated using ibv_exp_alloc_mkey_list_memory(). +.PP +The argument +.I mem +is an ibv_exp_mkey_list_container struct and is defined in <infiniband/verbs_exp.h> as follows: +.PP +.nf +struct ibv_exp_mkey_list_container { +.in +8 +uint32_t max_klm_list_size; /* maximum number of KLMs we can use to create the UMR */ +struct ibv_context *context; /* RDMA device context */ +.in -8 +}; + +.SH "RETURN VALUE" +.B ibv_exp_dealloc_mkey_list_memory() +returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.PP +.SH "SEE ALSO" +.BR ibv_exp_alloc_mkey_list_memory (3) +.SH "AUTHORS" +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_get_provider_func.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_get_provider_func.3 @@ -0,0 +1,46 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_GET_PROVIDER_FUNC 3 2014-04-23 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_get_provider_func \- gets provider's function pointer +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "void *ibv_exp_get_provider_func(struct ibv_context " "*context" ", enum ibv_exp_func_name " "name"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_get_provider_func() +returns a function's pointer of function +.I name +associated with device context +.I context. +This verb can be used to achieve direct access to provider's data path functions. +.PP +.sp +.fi +.PP +The argument +.I name +is an ibv_exp_func_name enum, as defined in . It should be one of the following values: +.PP +.nf +.TP +.B IBV_EXP_POST_SEND_FUNC \fR Get the exp_post_send's provider's function. +.TP +.B IBV_EXP_POLL_CQ_FUNC \fR Get the exp_poll_cq's provider's function. +.TP +.B IBV_POST_SEND_FUNC \fR Get the post_send's provider's function. +.TP +.B IBV_POLL_CQ_FUNC \fR Get the poll_cq's provider's function. +.TP +.B IBV_POST_RECV_FUNC \fR Get the post_recv's provider's function. + +.SH "RETURN VALUE" +On success, +.B ibv_exp_get_provider_func() +returns a function's pointer, and null on error. 
+.SH "AUTHORS" +.TP +Majd Dibbiny + Index: contrib/ofed/libibverbs/man/ibv_exp_modify_cq.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_modify_cq.3 @@ -0,0 +1,64 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_MODIFY_CQ 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_modify_cq \- modify attributes of a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_modify_cq(struct ibv_cq " "*cq" , +.BI " struct ibv_exp_cq_attr " "*cq_attr" , +.BI " int " "cq_attr_mask" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_modify_cq() +modifies the attributes of CQ +.I cq +with the attributes in +.I cq_attr +according to the mask +.I cq_attr_mask\fR. +.nf + +The argument \fIcq_attr\fR is an ibv_exp_cq_attr struct, as defined in <infiniband/verbs_exp.h>. +.PP +struct ibv_exp_cq_attr { +.in +8 +uint32_t comp_mask; /* Set to IBV_EXP_CQ_ATTR_RESERVED-1 */ +struct { +.in +8 +uint16_t cq_count; /* Event Generation Moderation counter */ +uint16_t cq_period; /* Event Generation Moderation timer in microseconds */ +.in -8 +} moderation; +uint32_t cq_cap_flags; /* use ibv_exp_cq_cap_flags from verbs_exp.h */ +.in -8 +}; +.fi +.PP +The argument +.I cq_attr_mask +specifies the CQ attributes to be modified. +The argument is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_CQ_MODERATION \fR Set the CQ moderation parameters +.TP +.B IBV_EXP_CQ_CAP_FLAGS \fR Set the CQ capability flags +.SH "RETURN VALUE" +.B ibv_exp_modify_cq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +If any of the modify attributes is invalid, none of the attributes will be modified.
+.SH "SEE ALSO" +.BR ibv_exp_create_cq (3), +.BR ibv_destroy_cq (3) + +.SH AUTHORS +.TP +Igor Ivanov +.RI < Igor.Ivanov@itseez.com > +.TP +Majd Dibbiny +.RI < majd@mellanox.com > Index: contrib/ofed/libibverbs/man/ibv_exp_modify_qp.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_modify_qp.3 @@ -0,0 +1,227 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_MODIFY_QP 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_modify_qp \- modify the attributes of a queue pair (QP) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_exp_modify_qp(struct ibv_qp " "*qp" ", struct ibv_exp_qp_attr " "*attr" , +.BI " uint64_t " "exp_attr_mask" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_modify_qp() +modifies the attributes of QP +.I qp +with the attributes in +.I attr +according to the mask +.I exp_attr_mask\fR. +The argument \fIattr\fR is an ibv_exp_qp_attr struct, as defined in . +.PP +.nf +struct ibv_exp_qp_attr { +.in +8 +enum ibv_qp_state qp_state; /* Move the QP to this state */ +enum ibv_qp_state cur_qp_state; /* Assume this is the current QP state */ +enum ibv_mtu path_mtu; /* Path MTU (valid only for RC/UC QPs) */ +enum ibv_mig_state path_mig_state; /* Path migration state (valid if HCA supports APM) */ +uint32_t qkey; /* Q_Key for the QP (valid only for UD QPs) */ +uint32_t rq_psn; /* PSN for receive queue (valid only for RC/UC QPs) */ +uint32_t sq_psn; /* PSN for send queue (valid only for RC/UC QPs) */ +uint32_t dest_qp_num; /* Destination QP number (valid only for RC/UC QPs) */ +int qp_access_flags; /* Mask of enabled remote access operations (valid only for RC/UC QPs) */ + /* use ibv_access_flags from verbs.h */ +struct ibv_qp_cap cap; /* QP capabilities (valid if HCA supports QP resizing) */ +struct ibv_ah_attr ah_attr; /* Primary path address vector (valid only for RC/UC QPs) */ +struct ibv_ah_attr alt_ah_attr; /* Alternate path address vector (valid only for RC/UC QPs) */ 
+uint16_t pkey_index; /* Primary P_Key index */ +uint16_t alt_pkey_index; /* Alternate P_Key index */ +uint8_t en_sqd_async_notify; /* Enable SQD.drained async notification (Valid only if qp_state is SQD) */ +uint8_t sq_draining; /* Is the QP draining? Irrelevant for ibv_modify_qp() */ +uint8_t max_rd_atomic; /* Number of outstanding RDMA reads & atomic operations on the destination QP (valid only for RC QPs) */ +uint8_t max_dest_rd_atomic; /* Number of responder resources for handling incoming RDMA reads & atomic operations (valid only for RC QPs) */ +uint8_t min_rnr_timer; /* Minimum RNR NAK timer (valid only for RC QPs) */ +uint8_t port_num; /* Primary port number */ +uint8_t timeout; /* Local ack timeout for primary path (valid only for RC QPs) */ +uint8_t retry_cnt; /* Retry count (valid only for RC QPs) */ +uint8_t rnr_retry; /* RNR retry (valid only for RC QPs) */ +uint8_t alt_port_num; /* Alternate port number */ +uint8_t alt_timeout; /* Local ack timeout for alternate path (valid only for RC QPs) */ +uint64_t dct_key; /* DC target key */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.fi +.PP +For details on struct ibv_qp_cap see the description of +.B ibv_create_qp()\fR. +For details on struct ibv_ah_attr see the description of +.B ibv_create_ah()\fR. +.PP +The argument +.I exp_attr_mask +specifies the QP attributes to be modified. 
+The argument is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_QP_STATE \fR Modify qp_state +.TP +.B IBV_EXP_QP_CUR_STATE \fR Set cur_qp_state +.TP +.B IBV_EXP_QP_EN_SQD_ASYNC_NOTIFY \fR Set en_sqd_async_notify +.TP +.B IBV_EXP_QP_ACCESS_FLAGS \fR Set qp_access_flags +.TP +.B IBV_EXP_QP_PKEY_INDEX \fR Set pkey_index +.TP +.B IBV_EXP_QP_PORT \fR Set port_num +.TP +.B IBV_EXP_QP_QKEY \fR Set qkey +.TP +.B IBV_EXP_QP_AV \fR Set ah_attr +.TP +.B IBV_EXP_QP_PATH_MTU \fR Set path_mtu +.TP +.B IBV_EXP_QP_TIMEOUT \fR Set timeout +.TP +.B IBV_EXP_QP_RETRY_CNT \fR Set retry_cnt +.TP +.B IBV_EXP_QP_RNR_RETRY \fR Set rnr_retry +.TP +.B IBV_EXP_QP_RQ_PSN \fR Set rq_psn +.TP +.B IBV_EXP_QP_MAX_QP_RD_ATOMIC \fR Set max_rd_atomic +.TP +.B IBV_EXP_QP_ALT_PATH \fR Set the alternative path via: alt_ah_attr, alt_pkey_index, alt_port_num, alt_timeout +.TP +.B IBV_EXP_QP_MIN_RNR_TIMER \fR Set min_rnr_timer +.TP +.B IBV_EXP_QP_SQ_PSN \fR Set sq_psn +.TP +.B IBV_EXP_QP_MAX_DEST_RD_ATOMIC \fR Set max_dest_rd_atomic +.TP +.B IBV_EXP_QP_PATH_MIG_STATE \fR Set path_mig_state +.TP +.B IBV_EXP_QP_CAP \fR Set cap +.TP +.B IBV_EXP_QP_DEST_QPN \fR Set dest_qp_num +.TP +.B IBV_EXP_QP_GROUP_RSS \fR set group RSS +.TP +.B IBV_EXP_QP_DC_KEY \fR set DC key +.SH "RETURN VALUE" +.B ibv_exp_modify_qp() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +If any of the modify attributes or the modify mask are invalid, none +of the attributes will be modified (including the QP state). +.PP +Not all devices support resizing QPs. To check if a device supports it, check if the +.B IBV_DEVICE_RESIZE_MAX_WR +bit is set in the device capabilities flags. +.PP +Not all devices support alternate paths. To check if a device supports it, check if the +.B IBV_DEVICE_AUTO_PATH_MIG +bit is set in the device capabilities flags. 
+.PP +The following tables indicate for each QP Transport Service Type, the +minimum list of attributes that must be changed upon transitioning QP +state from: Reset \-\-> Init \-\-> RTR \-\-> RTS. +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_UD\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_QKEY \fR +RTR \fB IBV_EXP_QP_STATE \fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_SQ_PSN \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_UC\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_ACCESS_FLAGS \fR +RTR \fB IBV_EXP_QP_STATE, IBV_EXP_QP_AV, IBV_EXP_QP_PATH_MTU, \fR + \fB IBV_EXP_QP_DEST_QPN, IBV_EXP_QP_RQ_PSN \fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_SQ_PSN \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_RC\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_ACCESS_FLAGS \fR +RTR \fB IBV_EXP_QP_STATE, IBV_EXP_QP_AV, IBV_EXP_QP_PATH_MTU, \fR + \fB IBV_EXP_QP_DEST_QPN, IBV_EXP_QP_RQ_PSN, \fR + \fB IBV_EXP_QP_MAX_DEST_RD_ATOMIC, IBV_EXP_QP_MIN_RNR_TIMER \fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_SQ_PSN, IBV_EXP_QP_MAX_QP_RD_ATOMIC, \fR + \fB IBV_EXP_QP_RETRY_CNT, IBV_EXP_QP_RNR_RETRY, IBV_EXP_QP_TIMEOUT \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_RAW_PACKET\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PORT\fR +RTR \fB IBV_EXP_QP_STATE\fR +RTS \fB 
IBV_EXP_QP_STATE\fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_XRC_RECV\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_ACCESS_FLAGS \fR +RTR \fB IBV_EXP_QP_STATE, IBV_EXP_QP_AV, IBV_EXP_QP_PATH_MTU, \fR + \fB IBV_EXP_QP_DEST_QPN, IBV_EXP_QP_RQ_PSN, \fR + \fB IBV_EXP_QP_MAX_DEST_RD_ATOMIC, IBV_EXP_QP_MIN_RNR_TIMER \fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_SQ_PSN, IBV_EXP_QP_TIMEOUT \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_XRC_SEND\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_ACCESS_FLAGS \fR +RTR \fB IBV_EXP_QP_STATE, IBV_EXP_QP_AV, IBV_EXP_QP_PATH_MTU, \fR + \fB IBV_EXP_QP_DEST_QPN, IBV_EXP_QP_RQ_PSN \fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_SQ_PSN, IBV_EXP_QP_MAX_QP_RD_ATOMIC, \fR + \fB IBV_EXP_QP_RETRY_CNT, IBV_EXP_QP_RNR_RETRY, IBV_EXP_QP_TIMEOUT \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_EXP_QPT_DC_INI\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PKEY_INDEX, IBV_EXP_QP_PORT, \fR + \fB IBV_EXP_QP_DC_KEY \fR +RTR \fB IBV_EXP_QP_STATE, IBV_EXP_QP_PATH_MTU, IBV_EXP_QP_AV\fR +RTS \fB IBV_EXP_QP_STATE, IBV_EXP_QP_TIMEOUT, IBV_EXP_QP_RETRY_CNT, \fR + \fB IBV_EXP_QP_RNR_RETRY, IBV_EXP_QP_MAX_QP_RD_ATOMIC \fR + +.fi +.SH "SEE ALSO" +.BR ibv_create_qp (3), +.BR ibv_destroy_qp (3), +.BR ibv_query_qp (3), +.BR ibv_create_ah (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Majd Dibbiny Index: contrib/ofed/libibverbs/man/ibv_exp_modify_wq.3 =================================================================== --- /dev/null +++ 
contrib/ofed/libibverbs/man/ibv_exp_modify_wq.3 @@ -0,0 +1,43 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_MODIFY_WQ 3 2015-08-04 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_modify_wq \- Modify a Work Queue (WQ). +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_modify_wq(struct ibv_exp_wq " "*wq" , +.BI " struct ibv_exp_wq_attr " "*wq_attr" ); +.sp +.fi +.SH "DESCRIPTION" +.B ibv_exp_modify_wq() +modifies a WQ +.I wq\fR. +The argument +.I wq_attr +is an ibv_exp_wq_attr struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_wq_attr { +.in +8 +uint32_t attr_mask; /* Use enum ibv_exp_wq_attr_mask */ +enum ibv_exp_wq_state wq_state; /* Move to this state */ +enum ibv_exp_wq_state curr_wq_state; /* Assume this is the current state */ +.in -8 +}; +.fi +.PP +The function +.B ibv_exp_modify_wq() +will modify the WQ based on the given +.I wq_attr\fB\fR->attr_mask +.SH "RETURN VALUE" +Returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "SEE ALSO" +.BR ibv_exp_create_wq (3), +.BR ibv_exp_destroy_wq (3) +.SH "AUTHORS" +.TP +Yishai Hadas Index: contrib/ofed/libibverbs/man/ibv_exp_poll_cq.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_poll_cq.3 @@ -0,0 +1,105 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_POLL_CQ 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_poll_cq \- poll a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_poll_cq(struct ibv_cq " "*ibcq" ", int " "num_entries" , +.BI " struct ibv_exp_wc " "*wc" ", uint32_t " "wc_size"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_poll_cq() +polls the CQ +.I ibcq +for work completions and returns the first +.I num_entries +(or all available completions if the CQ contains fewer than this number) in the array +.I wc\fR. +The argument +.I wc +is a pointer to an array of ibv_exp_wc structs, as defined in <infiniband/verbs_exp.h>.
+The argument +.I wc_size +is the size of each entry in the wc array. +.PP +.nf +struct ibv_exp_wc { +.in +8 +uint64_t wr_id; /* ID of the completed Work Request (WR) */ +enum ibv_wc_status status; /* Status of the operation */ +enum ibv_exp_wc_opcode exp_opcode; /* Operation type specified in the completed WR */ +uint32_t vendor_err; /* Vendor error syndrome */ +uint32_t byte_len; /* Number of bytes transferred */ +uint32_t imm_data; /* Immediate data (in network byte order) */ +uint32_t qp_num; /* Local QP number of completed WR */ +uint32_t src_qp; /* Source QP number (remote QP number) of completed WR (valid only for UD QPs) */ +int reserved; +uint16_t pkey_index; /* P_Key index (valid only for GSI QPs) */ +uint16_t slid; /* Source LID */ +uint8_t sl; /* Service Level */ +uint8_t dlid_path_bits; /* DLID path bits (not applicable for multicast messages) */ +uint64_t timestamp; /* Timestamp value */ +struct ibv_qp *qp; /* Pointer to the QP object on which the completion was received */ +struct ibv_srq *srq; /* Pointer to the SRQ object on which the completion was received */ +struct ibv_exp_dct *dct; /* Pointer to the DCT object on which the completion was received */ +uint64_t exp_wc_flags; /* Experimental flags of the completed WR */ +.in -8 +}; +.sp +.fi +.PP +The struct field exp_wc_flags indicates which of the struct fields are valid.
+It is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_WC_GRH \fR GRH is present (valid only for UD QPs) +.TP +.B IBV_EXP_WC_WITH_IMM \fR Immediate data value is valid +.TP +.B IBV_EXP_WC_WITH_INV \fR Immediate data value is valid and contains Rkey that was invalidated +.TP +.B IBV_EXP_WC_WITH_SL \fR SL field in WC is valid +.TP +.B IBV_EXP_WC_WITH_SLID \fR SLID field in WC is valid +.TP +.B IBV_EXP_WC_WITH_TIMESTAMP \fR Timestamp value in WC is valid +.TP +.B IBV_EXP_WC_QP \fR QP pointer in WC is valid +.TP +.B IBV_EXP_WC_SRQ \fR SRQ pointer in WC is valid +.TP +.B IBV_EXP_WC_DCT\fR DCT pointer in WC is valid +.PP +Not all +.I wc +attributes are always valid. If the completion status is other than +.B IBV_WC_SUCCESS\fR, +only the following attributes are valid: wr_id, status, qp_num, and vendor_err. +.SH "RETURN VALUE" +On success, +.B ibv_exp_poll_cq() +returns a non-negative value equal to the number of completions +found. On failure, a negative value is returned. +.SH "NOTES" +.PP +Each polled completion is removed from the CQ and cannot be returned to it. +.PP +The user should consume work completions at a rate that prevents CQ +overrun from occurrence. In case of a CQ overrun, the async event +.B IBV_EVENT_CQ_ERR +will be triggered, and the CQ cannot be used. 
+.SH "SEE ALSO" +.BR ibv_post_send (3), +.BR ibv_post_recv (3), +.BR ibv_exp_get_provider_func (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Moshe Lazer +.TP +Majd Dibbiny + Index: contrib/ofed/libibverbs/man/ibv_exp_post_send.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_post_send.3 @@ -0,0 +1,312 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_POST_SEND 3 2014-04-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_post_send \- post a list of work requests (WRs) to a send queue +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_exp_post_send(struct ibv_qp " "*qp" ", struct ibv_exp_send_wr " "*wr" , +.BI " struct ibv_exp_send_wr " "**bad_wr" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_post_send() +posts the linked list of work requests (WRs) starting with +.I wr +to the send queue of the queue pair +.I qp\fR. +It stops processing WRs from this list at the first failure (that can +be detected immediately while requests are being posted), and returns +this failing WR through +.I bad_wr\fR. +.PP +The argument +.I wr +is an ibv_exp_send_wr struct, as defined in . +.PP +.nf +struct ibv_exp_send_wr { +.in +8 +uint64_t wr_id; /* User defined WR ID */ +struct ibv_exp_send_wr *next; /* Pointer to next WR in list, NULL if last WR */ +struct ibv_sge *sg_list; /* Pointer to the s/g array */ +int num_sge; /* Size of the s/g array */ +enum ibv_exp_wr_opcode exp_opcode; /* Operation type. 
Use ibv_exp_wr_opcode */ +int reserved; +union { +.in +8 +uint32_t imm_data; /* Immediate data (in network byte order) */ +uint32_t invalidate_rkey; /* The R_Key to invalidate */ +.in -8 +} ex; +union { +.in +8 +struct { +.in +8 +uint64_t remote_addr; /* Start address of remote memory buffer */ +uint32_t rkey; /* Key of the remote Memory Region */ +.in -8 +} rdma; +struct { +.in +8 +uint64_t remote_addr; /* Start address of remote memory buffer */ +uint64_t compare_add; /* Compare operand */ +uint64_t swap; /* Swap operand */ +uint32_t rkey; /* Key of the remote Memory Region */ +.in -8 +} atomic; +struct { +.in +8 +struct ibv_ah *ah; /* Address handle (AH) for the remote node address */ +uint32_t remote_qpn; /* QP number of the destination QP */ +uint32_t remote_qkey; /* Q_Key number of the destination QP */ +.in -8 +} ud; +.in -8 +} wr; +union { +.in +8 +union { +.in +8 +struct { +.in +8 +uint32_t remote_srqn; /* SRQ number of the destination SRQ */ +.in -8 +} xrc; +.in -8 +} qp_type; +uint32_t xrc_remote_srq_num; /* SRQ number of the destination SRQ */ +.in -8 +}; +union { +.in +8 +struct { +.in +8 +uint64_t remote_addr; +uint32_t rkey; +.in -8 +} rdma; +struct { +.in +8 +uint64_t remote_addr; +uint64_t compare_add; +uint64_t swap; +uint32_t rkey; +.in -8 +} atomic; +struct { +.in +8 +struct ibv_cq *cq; /* Completion queue (CQ) that WAIT WR relates to */ +int32_t cq_count; /* Producer index (PI) of the CQ */ +.in -8 +} cqe_wait; /* Describes IBV_EXP_WR_CQE_WAIT WR */ +struct { +.in +8 +struct ibv_qp *qp; /* Queue pair (QP) that SEND_EN/RECV_EN WR relates to */ +int32_t wqe_count; /* Producer index (PI) of the QP */ +.in -8 +} wqe_enable; /* Describes IBV_EXP_WR_RECV_ENABLE and IBV_EXP_WR_SEND_ENABLE WR */ +.in -8 +} task; +union { +.in +8 +struct { +.in +8 +enum ibv_exp_calc_op calc_op; +enum ibv_exp_calc_data_type data_type; /* Supported types of data */ +enum ibv_exp_calc_data_size data_size; /* Accept data with size */ +.in -8 +} calc; +.in -8 +} op; +struct {
+.in +8 +struct ibv_ah *ah; +uint64_t dct_access_key; +uint32_t dct_number; +.in -8 +} dc; +struct { +.in +8 +struct ibv_mw *mw; /* The MW to bind */ +uint32_t rkey; /* The new R_Key to assign to the MW */ +struct ibv_exp_mw_bind_info bind_info; /* The rest of the bind information - the MR to bind to, + the requested access, the address to bind + and the length to bind */ +.in -8 +} bind_mw; +uint64_t exp_send_flags; /* use ibv_exp_send_flags */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +union { +.in +8 +struct { +.in +8 +uint32_t umr_type; /* use ibv_exp_umr_wr_type */ +struct ibv_exp_mkey_list_container *memory_objects; /* used when IBV_EXP_SEND_INLINE is not set */ +uint64_t exp_access; /* use ibv_exp_access_flags */ +struct ibv_mr *modified_mr; +uint64_t base_addr; +uint32_t num_mrs; /* array size of mem_repeat_block_list or mem_reg_list */ +union { +.in +8 +struct ibv_exp_mem_region *mem_reg_list; /* array, size corresponds to num_mrs */ +struct { +.in +8 +struct ibv_exp_mem_repeat_block *mem_repeat_block_list; /* array, size corresponds to num_mrs */ +size_t *repeat_count; /* array size corresponds to stride_dim */ +uint32_t stride_dim; +.in -8 +} rb; +.in -8 +} mem_list; +.in -8 +} umr; +struct { +.in +8 +uint32_t log_arg_sz; +uint64_t remote_addr; +uint32_t rkey; +union { +.in +8 +struct { +.in +8 +/* For the next four fields: +* If operand_size <= 8 then inline data is immediate +* from the corresponding field; for small operands, +* ls bits are used.
+* Else the fields are pointers in the process's address space +* where arguments are stored +*/ +union { +.in +8 +struct ibv_exp_cmp_swap cmp_swap; +struct ibv_exp_fetch_add fetch_add; +.in -8 +} op; +.in -8 +} inline_data; /* IBV_EXP_SEND_EXT_ATOMIC_INLINE is set */ +/* in the future add support for non-inline argument provisioning */ +.in -8 +} wr_data; +.in -8 +} masked_atomics; +.in -8 +} ext_op; +.in -8 +}; + +.sp +.nf +struct ibv_sge { +.in +8 +uint64_t addr; /* Start address of the local memory buffer */ +uint32_t length; /* Length of the buffer */ +uint32_t lkey; /* Key of the local Memory Region */ +.in -8 +}; +.sp +.nf +struct ibv_exp_cmp_swap { +.in +8 +uint64_t compare_mask; /* which bits to compare on CMP&SWP masked atomic operation */ +uint64_t compare_val; /* compare value */ +uint64_t swap_val; /* swap value */ +uint64_t swap_mask; /* which bits to swap on CMP&SWP masked atomic operation */ +.in -8 +}; +.sp +.nf +struct ibv_exp_fetch_add { +.in +8 +uint64_t add_val; /* value to add */ +uint64_t field_boundary; /* the boundry of the add operation on F&A masked atomic operation */ +.in -8 +}; +.fi +.PP +Each QP Transport Service Type supports a specific set of opcodes, as shown in the following table: +.PP +.nf +OPCODE | IBV_QPT_UD | IBV_QPT_UC | IBV_QPT_RC | IBV_EXP_QPT_DC_INI | IBV_QPT_XRC | +\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +IBV_EXP_WR_SEND | X | X | X | X | X | +IBV_EXP_WR_SEND_WITH_IMM | X | X | X | X | X | +IBV_EXP_WR_RDMA_WRITE | | X | X | X | X | +IBV_EXP_WR_RDMA_WRITE_WITH_IMM | | X | X | X | X | +IBV_EXP_WR_SEND_WITH_INV | | X | X | X | X | +IBV_EXP_WR_LOCAL_INV | | X | X | X | X | +IBV_EXP_WR_BIND_MW | | X | X | X | X | +IBV_EXP_WR_RDMA_READ | | X | X | X | X | +IBV_EXP_WR_ATOMIC_CMP_AND_SWP | | X | X | X | X | +IBV_EXP_WR_ATOMIC_FETCH_AND_ADD | | 
X | X | X | X | +IBV_EXP_WR_SEND_ENABLE | | X | X | X | X | +IBV_EXP_WR_RECV_ENABLE | | X | X | X | X | +IBV_EXP_WR_CQE_WAIT | | X | X | X | X | +IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP | | | X | X | X | +IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD | | | X | X | X | +IBV_EXP_WR_NOP | | | X | X | X | +IBV_EXP_WR_UMR_FILL | | | X | X | X | +IBV_EXP_WR_UMR_INVALIDATE | | | X | X | X | +.fi +.PP +The attribute exp_send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags: +.PP +.TP +.B IBV_EXP_SEND_FENCE \fR Set the fence indicator. Valid only for QPs with Transport Service Type \fBIBV_QPT_RC +.TP +.B IBV_EXP_SEND_SIGNALED \fR Set the completion notification indicator. Relevant only if QP was created with sq_sig_all=0 +.TP +.B IBV_EXP_SEND_SOLICITED \fR Set the solicited event indicator. Valid only for Send and RDMA Write with immediate +.TP +.B IBV_EXP_SEND_INLINE \fR Send data in given gather list as inline data +in a send WQE. Valid only for Send and RDMA Write. The L_Key will not be checked. +.TP +.B IBV_EXP_SEND_IP_CSUM \fR Request for checksum +.TP +.B IBV_EXP_SEND_WITH_CALC \fR Set to take in multiple buffers, perform a data reduction on them, and send the result. +.TP +.B IBV_EXP_SEND_WAIT_EN_LAST \fR Set to the last WR when using IBV_EXP_WR_CQE_WAIT. +.TP +.B IBV_EXP_SEND_EXT_ATOMIC_INLINE \fR Set to use extended atomic operations. Data will be inline in the send WQE. +.SH "RETURN VALUE" +.B ibv_exp_post_send() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +The user should not alter or destroy AHs associated with WRs until +request is fully executed and a work completion has been retrieved +from the corresponding completion queue (CQ) to avoid unexpected +behavior. 
+.PP +The buffers used by a WR can only be safely reused after the WR +request is fully executed and a work completion has been retrieved +from the corresponding completion queue (CQ). However, if the +IBV_EXP_SEND_INLINE flag was set, the buffer can be reused immediately +after the call returns. +.PP +When performing invalidation actions (send with invalidate or local +invalidate), the R_Key for the invalidation is taken from the imm_data +field. +.PP +When posting a WR to invalidate a memory window R_Key, the value of ex.invalidate_rkey +should contain the R_Key to invalidate. +.PP +When posting a WR to send data to a remote host, the value of ex.imm_data +should contain the data to send. +.PP +The table above is hardware dependent. Some opcodes might be supported for +certain QP types on some hardware, and not on others. + +.SH "SEE ALSO" +.BR ibv_create_qp (3), +.BR ibv_create_ah (3), +.BR ibv_post_recv (3), +.BR ibv_post_srq_recv (3), +.BR ibv_poll_cq (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Majd Dibbiny +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_post_task.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_post_task.3 @@ -0,0 +1,65 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_POST_TASK 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_post_task \- post a list of send/recv tasks (TASKs) to QPs +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_post_task(struct ibv_context " "*context" , +.BI " struct ibv_exp_task " "*task" , +.BI " struct ibv_exp_task " "**bad_task" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_post_task() +posts the linked list of tasks (TASKs) starting with +.I task\fR. +It stops processing TASKs from this list at the first failure (that can +be detected immediately while requests are being posted), and returns +this failing TASK through +.I bad_task\fR.
+Every TASK consists of a linked list of work requests (WRs) to be +posted to a queue pair (QP). The task list may have entries posted to +multiple QPs and includes send and receive communication primitives, +as well as the communication coordination primitives wait, send_enable +and receive_enable. +There are two types of TASK: send and recv. +.PP +The argument +.I task +is an ibv_exp_task struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_task { +.in +8 +enum ibv_exp_task_type task_type; /* Task type: IBV_EXP_TASK_SEND or IBV_EXP_TASK_RECV */ +struct { +.in +8 +struct ibv_qp *qp; /* Addressed QP */ +union { +.in +8 +struct ibv_exp_send_wr *send_wr; /* Pointer to next WR in list, NULL if last WR */ +struct ibv_recv_wr *recv_wr; /* Pointer to next WR in list, NULL if last WR */ +.in -8 +}; +.in -8 +} item; +.in -8 +struct ibv_exp_task *next; /* Pointer to next TASK in list, NULL if last TASK */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +}; +.sp +.fi +.PP +.SH "RETURN VALUE" +.B ibv_exp_post_task() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). + +.SH AUTHORS +.TP +Igor Ivanov +.RI < Igor.Ivanov@itseez.com > +.TP +Majd Dibbiny +.RI < Majd@mellanox.com > Index: contrib/ofed/libibverbs/man/ibv_exp_prefetch_mr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_prefetch_mr.3 @@ -0,0 +1,67 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_PREFETCH_MR 3 2013-08-06 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_prefetch_mr \- prefetch pages of an on-demand paging memory region +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int " "ibv_exp_prefetch_mr"( +.BI " struct ibv_mr " "*mr" ", struct ibv_exp_prefetch_attr " "*attr"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_prefetch_mr() +requests the driver to prefetch a given range of pages and map them for access from the HCA.
+The function is applicable to MRs that were registered as ODP, i.e. with the +experimental access flag +.B IBV_EXP_ACCESS_ON_DEMAND. +.PP +.I mr +- the memory region containing the area that the driver is requested to prefetch. +.PP +.I attr +- a structure containing the input arguments to the function. It is defined as: +.PP +.nf +struct ibv_exp_prefetch_attr { +.in +8 + uint32_t flags; /* Use enum ibv_exp_prefetch_flags in verbs_exp.h */ + void *addr; /* Address of the area to prefetch */ + size_t length; /* Length of the area to prefetch */ + uint32_t comp_mask; /* Use enum ibv_exp_prefetch_attr_comp_mask in verbs_exp.h */ +.in -8 +}; +.fi +.PP +.PP +.SH "RETURN VALUE" +.B ibv_exp_prefetch_mr() +returns 0 when the call was successful. Otherwise returns an error code: +.PP +.in +2 +.B ENOSYS +libibverbs or provider driver doesn't support the prefetching verb. +.PP +.in +2 +.B EFAULT +when the range requested is out of the memory region bounds, or when +parts of it are not part of the process address space. +.PP +.in +2 +.B EINVAL +when the MR is invalid. +.SH "NOTES" +This function is considered as a hint to the driver. The actual prefetch is +done on 'best effort' policy, meaning that the driver may decide to ignore +the fetch. In such case the call is considered successful, although no prefetch +was done. +.PP +This function is an experimental verbs extensions supported only by the +Mellanox OFED driver collection. It will be deprecated in the future once +a stable API is added to the upstream version of libibverbs. +.SH "SEE ALSO" +.BR ibv_exp_reg_mr (3) +.SH "AUTHORS" +.TP +Haggai Eran Index: contrib/ofed/libibverbs/man/ibv_exp_query_dct.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_dct.3 @@ -0,0 +1,62 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_QUERY_DCT 3 2014-08-28 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_dct \- query DCT's attributes. 
+.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_query_dct(struct ibv_exp_dct " "*dct" ", struct ibv_exp_dct_attr " "*attr"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_dct() +gets the attributes of +.I dct\fR +into the argument +.I attr\fR. +.I attr +is an ibv_exp_dct_attr struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_dct_attr { +.in +8 +uint64_t dc_key; /* DC access key (64 bit key) */ +uint8_t port; /* Port number */ +uint32_t access_flags; /* use ibv_access_flags from verbs.h */ +uint8_t min_rnr_timer; /* Min rnr NAK time between successive requests of rejected messages */ +uint8_t tclass; /* Traffic class used in packets sent by the DCT in case GRH is used */ +uint32_t flow_label; /* Flow label used in packets sent by the DCT in case GRH is used */ +enum ibv_mtu mtu; /* MTU of the DCT */ +uint8_t pkey_index; /* PKey index */ +uint8_t gid_index; /* Gid index associated with the DCT (to verify incoming packets if GRH is used) */ +uint8_t hop_limit; /* Hop limit used in packets sent by the DCT in case GRH is used */ +uint32_t key_violations; /* DC access key violation counter */ +uint8_t state; /* DCT state (IBV_EXP_DCT_STATE_ACTIVE, IBV_EXP_DCT_STATE_DRAINING, IBV_EXP_DCT_STATE_DRAINED) */ +struct ibv_srq *srq; /* The SRQ that will provide receive buffers */ +struct ibv_cq *cq; /* CQ used to report receive completions */ +struct ibv_pd *pd; /* Protection domain (PD) associated with the DCT */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.fi +.PP +The function +.B ibv_exp_query_dct() +will update the +.I attr +struct with the actual \s-1dct\s0 values of the DCT that was queried. +.PP +.SH "RETURN VALUE" +.B ibv_exp_query_dct() +returns 0 on success, or the value of errno on failure.
+ +.SH "NOTES" +.PP +.SH "SEE ALSO" +.BR ibv_exp_create_qp (3), +.BR ibv_exp_create_dct (3), +.BR ibv_exp_query_qp (3) +.SH "AUTHORS" +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_query_device.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_device.3 @@ -0,0 +1,157 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_QUERY_DEVICE 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_device \- query an RDMA device's attributes +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_query_device(struct ibv_context " "*context", +.BI " struct ibv_exp_device_attr " "*attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_device() +returns the attributes of the device with context +.I context\fR. +The argument +.I attr +is a pointer to an ibv_exp_device_attr struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_device_attr { +.in +8 +char fw_ver[64]; /* FW version */ +uint64_t node_guid; /* Node GUID (in network byte order) */ +uint64_t sys_image_guid; /* System image GUID (in network byte order) */ +uint64_t max_mr_size; /* Largest contiguous block that can be registered */ +uint64_t page_size_cap; /* Supported memory shift sizes */ +uint32_t vendor_id; /* Vendor ID, per IEEE */ +uint32_t vendor_part_id; /* Vendor supplied part ID */ +uint32_t hw_ver; /* Hardware version */ +int max_qp; /* Maximum number of supported QPs */ +int max_qp_wr; /* Maximum number of outstanding WR on any work queue */ +int reserved; /* Place holder to align with ibv_device_attr */ +int max_sge; /* Maximum number of s/g per WR for non-RD QPs */ +int max_sge_rd; /* Maximum number of s/g per WR for RD QPs */ +int max_cq; /* Maximum number of supported CQs */ +int max_cqe; /* Maximum number of CQE capacity per CQ */ +int max_mr; /* Maximum number of supported MRs */ +int max_pd; /* Maximum number of supported PDs */ +int max_qp_rd_atom; /* Maximum number of RDMA Read & Atomic operations
that can be outstanding per QP */ +int max_ee_rd_atom; /* Maximum number of RDMA Read & Atomic operations that can be outstanding per EEC */ +int max_res_rd_atom; /* Maximum number of resources used for RDMA Read & Atomic operations by this HCA as the Target */ +int max_qp_init_rd_atom; /* Maximum depth per QP for initiation of RDMA Read & Atomic operations */ +int max_ee_init_rd_atom; /* Maximum depth per EEC for initiation of RDMA Read & Atomic operations */ +enum ibv_exp_atomic_cap exp_atomic_cap; /* Atomic operations support level */ +int max_ee; /* Maximum number of supported EE contexts */ +int max_rdd; /* Maximum number of supported RD domains */ +int max_mw; /* Maximum number of supported MWs */ +int max_raw_ipv6_qp; /* Maximum number of supported raw IPv6 datagram QPs */ +int max_raw_ethy_qp; /* Maximum number of supported Ethertype datagram QPs */ +int max_mcast_grp; /* Maximum number of supported multicast groups */ +int max_mcast_qp_attach; /* Maximum number of QPs per multicast group which can be attached */ +int max_total_mcast_qp_attach;/* Maximum number of QPs which can be attached to multicast groups */ +int max_ah; /* Maximum number of supported address handles */ +int max_fmr; /* Maximum number of supported FMRs */ +int max_map_per_fmr; /* Maximum number of (re)maps per FMR before an unmap operation is required */ +int max_srq; /* Maximum number of supported SRQs */ +int max_srq_wr; /* Maximum number of WRs per SRQ */ +int max_srq_sge; /* Maximum number of s/g per SRQ */ +uint16_t max_pkeys; /* Maximum number of partitions */ +uint8_t local_ca_ack_delay; /* Local CA ack delay */ +uint8_t phys_port_cnt; /* Number of physical ports */ +uint32_t comp_mask; /* Compatibility mask that defines which struct members of ibv_exp_device_attr exist and are valid */ + /* Use enum ibv_exp_device_attr_comp_mask */ +struct ibv_exp_device_calc_cap calc_cap; /* Capabilities of the device to do CALC operations */ +uint64_t timestamp_mask; /* The valid timestamp mask.
This field indicates the number of bits the device supports for timestamping */ +uint64_t hca_core_clock; /* HCA core frequency */ +uint64_t exp_device_cap_flags; /* HCA experimental capabilities mask */ +int max_dc_req_rd_atom; /* Max number of outstanding RDMA/atomic requests that can be issued by the requestor */ +int max_dc_res_rd_atom; /* Max number of outstanding RDMA/atomic requests that can be issued by the responder */ +int inline_recv_sz; /* Inline receive size */ +uint32_t max_rss_tbl_sz; /* Max RSS table size */ +struct ibv_exp_ext_atomics_params ext_atom; /* Extended Atomics params */ +struct ibv_exp_umr_caps umr_caps; /* UMR capabilities */ +struct ibv_exp_odp_caps odp_caps; /* On-Demand Paging capabilities */ +int max_dct; /* Max DC targets */ +int max_ctx_res_domain; /* Max context resource domain */ +struct ibv_exp_rx_hash_caps rx_hash_caps; /* RX hash capabilities */ +uint32_t max_wq_type_rq; /* Max Work Queue of type RQ */ +int max_device_ctx; /* Maximum device contexts */ +struct ibv_exp_mp_rq_caps mp_rq_caps; /* Multi-Packet Receive Queue(RQ) capabilities */ +.in -8 +}; + +struct ibv_exp_ext_atomics_params { +.in +8 +uint64_t log_atomic_arg_sizes; /* bit-mask of supported sizes */ +uint32_t max_fa_bit_boundary; /* Max fetch and add bit boundary */ +uint32_t log_max_atomic_inline; /* log of the maximum atomic inline size */ +.in -8 +}; + +struct ibv_exp_umr_caps { +.in +8 +uint32_t max_klm_list_size; /* maximum number of klms that can be used to create umr */ +uint32_t max_send_wqe_inline_klms; /* maximum number of klms that can be used to create UMR using inline */ +uint32_t max_umr_recursion_depth; /* maximum recursion depth we can use to create klm using other klms */ +uint32_t max_umr_stride_dimension; /* maximum dimension of the klm */ +.in -8 +}; + +struct ibv_exp_odp_caps { + uint64_t general_odp_caps; /* Mask with enum ibv_odp_general_cap_bits */ + struct { + uint32_t rc_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which
operations are supported. */ + uint32_t uc_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which operations are supported. */ + uint32_t ud_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which operations are supported. */ + uint32_t dc_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which operations are supported. */ + uint32_t xrc_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which operations are supported. */ + uint32_t raw_eth_odp_caps; /* Mask with enum ibv_odp_tranport_cap_bits to know which operations are supported. */ + } per_transport_caps; +}; + +struct ibv_exp_rx_hash_caps { +.in +8 +uint32_t max_rwq_indirection_tables; /* Max number of receive work queue indirection tables */ +uint32_t max_rwq_indirection_table_size; /* Max size of receive work queue indirection table */ +uint8_t supported_hash_functions; /* Mask with enum ibv_exp_rx_hash_function_flags to know which hash functions are supported */ +uint64_t supported_packet_fields; /* Mask with enum ibv_exp_rx_hash_fields to know which packet fields are supported */ +uint32_t supported_qps; /* Mask with enum ibv_exp_supported_qp_types to know which QP types support RX hash */ +.in -8 +}; + +struct ibv_exp_mp_rq_caps { +.in +8 +uint32_t supported_qps; /* Mask with enum ibv_exp_supported_qp_types to know which QP types support MP RQ */ +uint32_t allowed_shifts; /* Mask with enum ibv_exp_mp_rq_shifts to know which payload shifts are supported */ +uint8_t min_single_wqe_log_num_of_strides; /* Log of minimum number of strides for single WQE */ +uint8_t max_single_wqe_log_num_of_strides; /* Log of maximum number of strides for single WQE */ +uint8_t min_single_stride_log_num_of_bytes; /* Log of minimum number of bytes in single stride */ +uint8_t max_single_stride_log_num_of_bytes; /* Log of maximum number of bytes in single stride */ +.in -8 +}; + + +.fi +.SH "RETURN VALUE" +.B ibv_exp_query_device() +returns 0 on success, or the value of errno on 
failure (which indicates the failure reason). +.SH "NOTES" +The maximum values returned by this function are the upper limits of +resources supported by the device. However, it may not be possible to +use these maximum values, since the actual number of any resource that +can be created may be limited by the machine configuration, the amount +of host memory, user permissions, and the amount of resources already +in use by other users/processes. +.SH "SEE ALSO" +.BR ibv_open_device (3), +.BR ibv_query_port (3), +.BR ibv_query_pkey (3), +.BR ibv_query_gid (3) +.SH "AUTHORS" +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_query_gid_attr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_gid_attr.3 @@ -0,0 +1,47 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_QUERY_GID_ATTR 3 2015-08-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_gid_attr \- query GID attributes +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_query_gid_attr(struct ibv_context " "*context" ", uint8_t " "port_num" , +.BI " unsigned int " "index" ", struct ibv_exp_gid_attr " "*gid_attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_gid_attr() +returns the GID attributes in entry +.I index +of port +.I port_num +for device context +.I context +through the pointer +.I gid_attr\fR. +.PP +The argument +.I gid_attr +is an ibv_exp_gid_attr struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_gid_attr { +.in +8 +uint32_t comp_mask; /* Use ibv_exp_query_gid_attr */ +enum ibv_exp_roce_gid_type type; /* The GID type */ +union ibv_gid gid; /* The GID in the desired index of the specified port */ +.in -8 +}; + +comp_mask is used as an input to the verb, to choose which of the fields should +be queried, and as an output to indicate which fields were queried. + +.SH "RETURN VALUE" +.B ibv_exp_query_gid_attr() +returns 0 on success, or the value of errno on failure.
+.SH "SEE ALSO" +.BR ibv_query_gid (3) +.SH "AUTHORS" +.TP +Majd Dibbiny Index: contrib/ofed/libibverbs/man/ibv_exp_query_intf.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_intf.3 @@ -0,0 +1,293 @@ +.TH IBV_EXP_QUERY_INTF 3 2015-05-25 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_intf \- query for a family of verbs interfaces for a specific QP/WQ/CQ +.PP +ibv_exp_release_intf \- release the queried interface +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.BI "static inline void *ibv_exp_query_intf(struct ibv_context " "*context" "," +.BI " struct ibv_exp_query_intf_params " "*params" "," +.BI " enum ibv_exp_query_intf_status " "*status" ");" +.BI "static inline int ibv_exp_release_intf(struct ibv_context " "*context" ", void " "*intf" "," +.BI " struct ibv_exp_release_intf_params " "*params" ");" +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_intf() +provides a mechanism to extend the verbs with families of verbs interfaces. +These extensions provide a way to optimize data-path interfaces (e.g. post-send/recv, poll-cq) +for specific kinds of applications (e.g. DPDK). +.PP +The argument +.I params +is an ibv_exp_query_intf_params struct, as defined in <infiniband/verbs_exp.h>.
+.PP +.nf +enum ibv_exp_query_intf_flags { + /* Interface functions includes correctness and validity checks */ + IBV_EXP_QUERY_INTF_FLAG_ENABLE_CHECKS = (1 << 0), +}; + +enum ibv_exp_intf_family { + IBV_EXP_INTF_QP_BURST, + IBV_EXP_INTF_CQ, + IBV_EXP_INTF_WQ, + IBV_EXP_INTF_RESERVED, +}; + +enum ibv_exp_intf_scope { + IBV_EXP_INTF_GLOBAL, /* Permanent interface, identified by + * the ibv_exp_intf_family enum + */ + IBV_EXP_INTF_EXPERIMENTAL, /* Interface under evaluation, identified by + * the ibv_exp_experimental_intf_family enum + * This interface may change between lib + * versions + */ + IBV_EXP_INTF_VENDOR, /* Vendor specific interface, defined in vendor + * separate header file + */ + IBV_EXP_INTF_VENDOR_EXPERIMENTAL, /* Vendor interface under evaluation, + * defined in vendor separate header + * file + */ +}; + +struct ibv_exp_query_intf_params { + uint32_t flags; /* use ibv_exp_query_intf_flags */ + enum ibv_exp_intf_scope intf_scope; + uint64_t vendor_guid; /* set in case VENDOR intf_scope selected */ + uint32_t intf; /* for GLOBAL intf_scope use ibv_exp_intf_family enum */ + uint32_t intf_version; /* Version */ + void *obj; /* interface object (CQ/QP/WQ) */ + void *family_params; /* Family-specific params */ + uint32_t family_flags; /* Family-specific flags */ + uint32_t comp_mask; /* use ibv_exp_query_intf_comp_mask */ +}; +.fi +.PP +.B ibv_exp_release_intf() +release the queried interface previously obtained by +.B ibv_exp_query_intf() + +.SH "RETURN VALUE" +.B ibv_exp_query_intf() +.PP +On failure the function returns NULL and the +.I status +contains the failure reason as defined by the ibv_exp_query_intf_status enum: +.PP +.nf +/* Return status from ibv_exp_query_intf */ +enum ibv_exp_query_intf_status { + IBV_EXP_INTF_STAT_OK, + IBV_EXP_INTF_STAT_VENDOR_NOT_SUPPORTED, /* The provided 'vendor_guid' is not supported */ + IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED, /* The provided 'intf' is not supported */ + IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED, /* The 
provided 'intf_version' is not supported */ + IBV_EXP_INTF_STAT_INVAL_PARARM, /* General invalid parameter */ + IBV_EXP_INTF_STAT_INVAL_OBJ_STATE, /* QP is not in INIT, RTR or RTS state */ + IBV_EXP_INTF_STAT_INVAL_OBJ, /* Mismatch between the provided 'obj'(CQ/QP/WQ) and requested 'intf' */ + IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED, /* The provided set of 'flags' is not supported */ + IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED, /* The provided set of 'family_flags' is not supported */ +}; +.fi +.PP +On success the function returns a pointer to the requested interface family. +The families currently supported are: +.PP +.B QP-burst family: +.PP +.nf +/* Flags to use in family_flags field of ibv_exp_query_intf_params on family creation */ +enum ibv_exp_qp_burst_family_create_flags { + /* To disable loop-back of multi-cast messages in RAW-ETH */ + IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK = (1 << 0), + /* To enable Multi-Packet send WR when possible */ + IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR = (1 << 1), +}; + +/* Flags to use on send functions of QP burst family */ +enum ibv_exp_qp_burst_family_flags { + IBV_EXP_QP_BURST_SIGNALED = 1 << 0, + IBV_EXP_QP_BURST_SOLICITED = 1 << 1, + IBV_EXP_QP_BURST_IP_CSUM = 1 << 2, + IBV_EXP_QP_BURST_TUNNEL = 1 << 3, + IBV_EXP_QP_BURST_FENCE = 1 << 4, +}; + +struct ibv_exp_qp_burst_family { + /* + * send_pending - Put one message in the provider send queue. + * + * Common usage: After calling send_pending several times + * the application needs to call send_flush to ensure the send + * of the pending messages. + * Note: Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending)(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags); + /* + * send_pending_inline - Put one inline message in the provider send queue. + * + * Common usage: Same as send_pending + * Notes: + * - The message length must fit the max inline size of the QP.
+ * Providing bigger messages may lead to data corruption and + * segmentation fault. + * - Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending_inline)(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags); + /* + * send_pending_sg_list - Put one scatter-gather(sg) message in the provider send queue. + * + * Common usage: Same as send_pending + * Notes: + * - The number of sg entries must fit the max_send_sge of the QP. + * Providing bigger list of sg entries may lead to data corruption and + * segmentation fault. + * - Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_pending_sg_list)(struct ibv_qp *qp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags); + /* + * send_flush - To flush the pending messages. + * + * Note: Use ibv_exp_qp_burst_family_flags for the flags field + */ + int (*send_flush)(struct ibv_qp *qp); + /* + * send_burst - Send a list of 'num' messages (no send_flush required in this case) + */ + int (*send_burst)(struct ibv_qp *qp, struct ibv_sge *msg_list, uint32_t num, uint32_t flags); + /* + * recv_burst - Post a set of 'num' receive buffers. + * + * Note: One sge per message is supported by this function + */ + int (*recv_burst)(struct ibv_qp *qp, struct ibv_sge *msg_list, uint32_t num); +}; +.fi +.PP +.B WQ family: +.PP +.nf +struct ibv_exp_wq_family { + /* + * recv_sg_list - Post one scatter-gather(sg) receive buffer. + * + * Note: + * - The number of sg entries must fit the max_recv_sge of the WQ. + * Providing bigger list of sg entries may lead to data corruption and + * segmentation fault. + */ + int (*recv_sg_list)(struct ibv_exp_wq *wq, struct ibv_sge *sg_list, uint32_t num_sg); + /* + * recv_burst - Post a set of 'num' receive buffers. 
+ * + * Note: One sge per message is supported by this function + */ + int (*recv_burst)(struct ibv_exp_wq *wq, struct ibv_sge *msg_list, uint32_t num); +}; +.fi +.PP +.B CQ family: +.PP +.nf +enum ibv_exp_cq_family_flags { + /* The cq_family_flags are applicable + * according to the existence of the + * related device capabilities flags */ + IBV_EXP_CQ_RX_IP_CSUM_OK = 1 << 0, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK = 1 << 1, /* IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_IPV4_PACKET = 1 << 2, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_IPV6_PACKET = 1 << 3, /* IBV_EXP_DEVICE_RX_CSUM_IP_PKT or IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT */ + IBV_EXP_CQ_RX_TUNNEL_PACKET = 1 << 4, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK = 1 << 5, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK = 1 << 6, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IPV4_PACKET = 1 << 7, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + IBV_EXP_CQ_RX_OUTER_IPV6_PACKET = 1 << 8, /* IBV_EXP_DEVICE_VXLAN_SUPPORT */ + + /* Flags supported from CQ family version 1 */ + /* Multi-Packet RQ flag */ + IBV_EXP_CQ_RX_MULTI_PACKET_LAST_V1 = 1 << 9, /* Last packet on WR */ +}; + +/* All functions of CQ family included in CQ family version 1 */ +struct ibv_exp_cq_family { + int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); + int32_t (*poll_length)(struct ibv_cq *cq, void *buf, uint32_t *inl); + int32_t (*poll_length_flags)(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags); +}; + +struct ibv_exp_cq_family_v1 { + /* + * poll_cnt - Poll up to 'max' valid completions + * + * The function returns the number of valid completions it + * managed to drain from the CQ. + * + * Usage example: In case a CQ is connected to one send-queue + * the application may use this function to get + * the number of the QP send-completions. 
+ * + * Return value (n): + * n >= 0 : number extracted completions. + * n == -1 : operation failed. completion is not extracted. + * To extract this completion, ibv_poll_cq() must be used + */ + int32_t (*poll_cnt)(struct ibv_cq *cq, uint32_t max); + /* + * poll_length - Poll one receive completion and provide the related + * message length. + * + * The function returns only the length of the completed message. + * In case of inline received message the message will be copied + * to the provided buffer ('buf') and the '*inl' status will be set. + * The function extracts only completion of regular receive-messages. + * In case of send-message completion or SRQ receive-message completion + * it returns -1. + * + * Usage example: In case a CQ is connected to one receive-queue + * the application may use this function to get + * the size of the next received message. + * + * Return value (n): + * n > 0 : successful completion with positive length. + * *inl will be set to 1 if data was copied to buffer. + * + * 0 : Empty. + * n == -1 : operation failed. completion is not extracted. + * To extract this completion, ibv_poll_cq() must be used + */ + int32_t (*poll_length)(struct ibv_cq *cq, void *buf, uint32_t *inl); + /* + * poll_length_flags - Poll one receive completion and provide the related + * message length and completion flags. + * + * The same as poll_length but also retrieves completion flags as + * defined by the enum ibv_exp_cq_family_flags + */ + int32_t (*poll_length_flags)(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags); + /* + * poll_length_flags_mp_rq - Poll one receive completion and provide the related + * message length, packet-offset and completion flags. + * + * The same as poll_length_flags but: + * - Without the inline-receive support. + * - Also retrieves packet-offset (for multi-packet RQ). 
+ */ + int32_t (*poll_length_flags_mp_rq)(struct ibv_cq *cq, uint32_t *offset, uint32_t *flags); +}; +.fi + +.PP +.B ibv_exp_release_intf() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). + +.SH "NOTES" +Application may call ibv_exp_query_intf for QPs in the following states: + IBV_QPS_INIT, IBV_QPS_RTR and IBV_QPS_RTS + +.SH "SEE ALSO" +.BR ibv_exp_create_res_domain (3) +.SH "AUTHORS" +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_query_mkey.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_mkey.3 @@ -0,0 +1,48 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_QUERY_MKEY 3 2014-08-28 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_mkey \- query MR's attribute, for MRs created by ibv_exp_create_mr() function +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_exp_query_mkey(struct ibv_mr " "*mr" ", struct ibv_exp_mkey_attr " "*attr"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_mkey() +gets the attributes of the memory region (MR) +.I mr +into the argument +.I attr. +.PP +The argument +.I attr +is an ibv_exp_mkey_attr struct, as defined in . +.PP +.nf +struct ibv_exp_mkey_attr { +.in +8 +uint32_t max_klm_list_size; /* maximum number of MRs we can use to create the UMR */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.fi +.PP +The function +.B ibv_exp_query_mkey() +will update the +.I attr +struct with the actual \s-1mr\s0 values of the MR that were queried. +.PP +.SH "RETURN VALUE" +.B ibv_exp_query_mkey() +returns 0 on success, or the value of errno on failure. 
+ +.SH "NOTES" +.PP +.SH "SEE ALSO" +.BR ibv_exp_create_mr (3) +.SH "AUTHORS" +.TP +Haggai Abramovsky Index: contrib/ofed/libibverbs/man/ibv_exp_query_values.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_query_values.3 @@ -0,0 +1,51 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_QUERY_VALUES 3 2014-11-13 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_query_values \- query values +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "int ibv_exp_query_values(struct ibv_context " "*context" ", int " "q_values" ", struct ibv_exp_values " "*values"); +.fi +.SH "DESCRIPTION" +.B ibv_exp_query_values() +queries the values defined in the bitmask +.I q_values +into the appropriate fields of +.I values. +.PP +The argument +.I q_values +is enum ibv_exp_values_comp_mask, as defined in <infiniband/verbs_exp.h>. It is a bitwise OR of one or more of the following flags: +.TP +.B IBV_EXP_VALUES_HW_CLOCK_NS \fR Query hardware clock in nano-seconds +.TP +.B IBV_EXP_VALUES_HW_CLOCK \fR Query hardware clock in cycles +.PP +The argument +.I values +is an ibv_exp_values struct, as defined in <infiniband/verbs_exp.h>: +.nf +struct ibv_exp_values { +.in +8 +uint32_t comp_mask; /* Bitmask that is used as input to indicate which of the following fields exist and as output to + indicate which values were queried. Use enum ibv_exp_values_comp_mask */ +uint64_t hwclock_ns; /* If requested, the queried value of hardware clock in nano-seconds */ +uint64_t hwclock; /* If requested, the queried value of +hardware clock in cycles */ +.in -8 +}; +.fi +.PP +.SH "RETURN VALUE" +.B ibv_exp_query_values() +returns 0.
+ +.SH "NOTES" +.PP +.SH "SEE ALSO" +.SH "AUTHORS" +.TP +Majd Dibbiny Index: contrib/ofed/libibverbs/man/ibv_exp_reg_mr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_reg_mr.3 @@ -0,0 +1,183 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_REG_MR 3 2014-04-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_reg_mr \- register a memory region (MR) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_mr *ibv_exp_reg_mr(struct ibv_exp_reg_mr_in " "*in" ); +.fi +.SH "DESCRIPTION" +.B ibv_exp_reg_mr() +registers a memory region (MR) associated with the protection domain +.I in->pd\fR. +The MR's starting address is +.I in->addr +and its size is +.I in->length\fR. + +.PP +The argument +.I in +is an ibv_exp_reg_mr_in struct, as defined in <infiniband/verbs_exp.h>. +.PP +.nf +struct ibv_exp_reg_mr_in { +.in +8 +struct ibv_pd *pd; /* Protection domain (PD) associated with the MR */ +void *addr; /* MR's starting address */ +size_t length; /* MR's length */ +uint64_t exp_access; /* Use ibv_exp_access_flags */ +uint32_t comp_mask; /* use ibv_exp_reg_mr_in_comp_mask */ +uint32_t create_flags; /* use ibv_exp_reg_mr_create_flags */ +.in -8 +}; + +.fi +.I in->exp_access +describes the desired memory protection attributes; it is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_ACCESS_LOCAL_WRITE \fR Enable Local Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_EXP_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported) +.TP +.B IBV_EXP_ACCESS_MW_BIND\fR Enable Memory Window Binding +.TP +.B IBV_EXP_ACCESS_ALLOCATE_MR\fR Request the low level driver to allocate the memory used for backing the MR. Could improve performance in some cases. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_USER_READ\fR Enable sharing this MR for reading by user (application owner).
+.TP +.B IBV_EXP_ACCESS_SHARED_MR_USER_WRITE\fR Enable sharing this MR for writing by user. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_GROUP_READ\fR Enable sharing this MR for reading by group (application group). +.TP +.B IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE\fR Enable sharing this MR for writing by group. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_OTHER_READ\fR Enable sharing this MR for reading by other. +.TP +.B IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE\fR Enable sharing this MR for writing by other. +.TP +.B IBV_EXP_ACCESS_NO_RDMA\fR Disable RDMA on shared MR. +.TP +.B IBV_EXP_ACCESS_ON_DEMAND\fR Create an on-demand paging MR. +.TP +.B IBV_EXP_ACCESS_RELAXED\fR Create an on-demand paging Relaxed MR. +.TP +.B IBV_EXP_ACCESS_RESERVED\fR Library's internal variable - used for validity checks. +.PP +If +.B IBV_EXP_ACCESS_REMOTE_WRITE +or +.B IBV_EXP_ACCESS_REMOTE_ATOMIC +is set, then +.B IBV_EXP_ACCESS_LOCAL_WRITE +must be set too. +.PP +If +.B IBV_EXP_ACCESS_ALLOCATE_MR +is used, +.I in->addr +must be NULL, and +.B ibv_exp_reg_mr() +will allocate a memory block automatically, whose address will be page aligned. +This block will be freed implicitly when +.B ibv_dereg_mr() +is called. +.PP +If one of the +.B IBV_EXP_ACCESS_SHARED_XXX +flags is used, +.I in->addr +and +.I in->length +must be page aligned. Additionally, the pages composing the MR must not be Anonymous/LRU ones. If +.B IBV_EXP_ACCESS_ALLOCATE_MR +is set this behavior is guaranteed. +.PP +If +.B IBV_EXP_ACCESS_RELAXED +is set, then +.B IBV_EXP_ACCESS_ON_DEMAND +must be set too and +.I in->length +shouldn't exceed 128MB. Using this access flag allows the memory registration to take a +more relaxed approach with the resulting MR. The resulting MR may cover a +larger memory region than requested, may allow more operations than the access +flags that were passed to it and its rkey may remain valid even after +ibv_dereg_mr is called. It's used to speed up the ODP MR registration method for +remote access.
+.PP +Local read access is always enabled for the MR. + +.fi +.I in->create_flags +describes the desired memory create flags; it is either 0 or the bitwise OR of one or more of the following flags: +.PP +.B IBV_EXP_REG_MR_CREATE_CONTIG\fR Request the driver to allocate the memory, with the address passed in +.B in->addr + +.I in->comp_mask +describes which fields after comp_mask are present and valid; it is either 0 or the bitwise OR of one or more of the following flags: +.PP +.B IBV_EXP_REG_MR_CREATE_FLAGS \fR the field +.B in->create_flags +is present and the driver should use it. + +.SH "RETURN VALUE" +.B ibv_exp_reg_mr() +returns a pointer to the registered MR, or NULL if the request fails. +The local key (\fBL_Key\fR) field +.B lkey +is used as the lkey field of struct ibv_sge when posting buffers with +ibv_post_* verbs, and the remote key (\fBR_Key\fR) +field +.B rkey +is used by remote processes to perform Atomic and RDMA operations. The remote process places this +.B rkey +as the rkey field of struct ibv_send_wr passed to the ibv_post_send function. +If +.B IBV_EXP_ACCESS_ALLOCATE_MR\fR is used, the field +.B in->addr\fR will hold the address of the allocated memory block. + +If +.B IBV_EXP_REG_MR_CREATE_CONTIG\fR is used, +.B IBV_EXP_ACCESS_ALLOCATE_MR +must not be used. If the user passes null in +.B in->addr +, the driver will try to allocate an arbitrary address and register it. If the user passes a specific address (that is, not null), the driver will try to allocate the passed address and register it. + +.SH "NOTES" +.PP +The user can register an Implicit ODP MR, which provides them an implicit lkey +that represents their complete address space. Implicit ODP MR is limited to +local access permissions (local read or write). It only has a valid lkey; its +rkey is invalid. There's a limit on the size of operations that can use this +lkey and it is 128MB.
+ +In order to register an Implicit ODP MR, in addition to the +IBV_EXP_ACCESS_ON_DEMAND access flag, use +.B in->addr = 0 +and +.B in->length = IBV_EXP_IMPLICIT_MR_SIZE. + +.SH "SEE ALSO" +.BR ibv_dereg_mr (3), +.BR ibv_alloc_pd (3), +.BR ibv_post_send (3), +.BR ibv_post_recv (3), +.BR ibv_post_srq_recv (3), +.BR ibv_reg_shared_mr (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Majd Dibbiny +.TP +Moshe Lazer Index: contrib/ofed/libibverbs/man/ibv_exp_reg_shared_mr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_reg_shared_mr.3 @@ -0,0 +1,105 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_REG_SHARED_MR 3 2014-04-09 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_reg_shared_mr \- register a shared memory region (MR) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_mr *ibv_exp_reg_shared_mr(struct ibv_exp_reg_shared_mr_in " *in "); " +.fi +.SH "DESCRIPTION" +.fi +.B ibv_exp_reg_shared_mr() +registers a memory region (MR) associated with the protection domain +.I in->pd\fR. +.I in->mr_handle +is the identifier of the MR whose physical memory is going to be shared by this MR. +This +.I in->mr_handle +is returned as part of +.I struct ibv_mr +and is valid when the original MR was created with sharing access mode. +The +.I in->addr +can be NULL and this is the common usage. If +.I in->addr +is not NULL, then the kernel takes it as a hint about the returned +.I addr\fR. +The argument +.I in +is an ibv_exp_reg_shared_mr_in struct, as defined in <infiniband/verbs_exp.h>. + +struct ibv_exp_reg_shared_mr_in { +.in +8 +.nf +uint32_t mr_handle; +struct ibv_pd *pd; +void *addr; +uint64_t exp_access; /* use ibv_exp_access_flags from verbs_exp.h */ +uint32_t comp_mask; /* reserved for future growth (must be 0) */ +.in -8 +}; +.fi +The struct field +.I exp_access +describes the desired memory protection attributes; it must not exceed the permissions given when the original MR was created.
+It is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_ACCESS_LOCAL_WRITE \fR Enable Local Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_EXP_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported) +.TP +.B IBV_EXP_ACCESS_MW_BIND\fR Enable Memory Window Binding +.PP +If +.B IBV_EXP_ACCESS_REMOTE_WRITE +or +.B IBV_EXP_ACCESS_REMOTE_ATOMIC +is set, then +.B IBV_EXP_ACCESS_LOCAL_WRITE +must be set too. +.PP +Local read access is always enabled for the MR. +.nf +.fi +.SH "RETURN VALUE" +.B ibv_exp_reg_shared_mr() +returns a pointer to the registered MR, or NULL if the request fails. +The local key (\fBL_Key\fR) field +.B lkey +is used as the lkey field of struct ibv_sge when posting buffers with +ibv_post_* verbs, and the remote key (\fBR_Key\fR) +field +.B rkey +is used by remote processes to perform Atomic and RDMA operations. The remote process places this +.B rkey +as the rkey field of struct ibv_send_wr passed to the ibv_post_send function. +.SH "NOTES" +Once +.B ibv_exp_reg_shared_mr() +has succeeded, its memory is valid even if the original MR was released by an +.B ibv_dereg_mr() \fR call.
+To release this shared MR +.B ibv_dereg_mr() +should be used; the returned +.I addr +must not be freed explicitly, as it is freed internally as part of +.B ibv_dereg_mr()\fR. + +.SH "SEE ALSO" +.BR ibv_post_send (3), +.BR ibv_post_recv (3), +.BR ibv_post_srq_recv (3), +.BR ibv_reg_mr (3) +.SH "AUTHORS" +.TP +Dotan Barak +.TP +Majd Dibbiny Index: contrib/ofed/libibverbs/man/ibv_exp_rereg_mr.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_exp_rereg_mr.3 @@ -0,0 +1,82 @@ +.\" -*- nroff -*- +.\" +.TH IBV_EXP_REREG_MR 3 2014-08-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_exp_rereg_mr \- re-register a memory region (MR) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_exp_rereg_mr(struct ibv_mr " "*mr" ", int " " flags" , +.BI " struct ibv_pd * " "pd" ", void " " *addr", +.BI " size_t " " length" ", uint64_t " " access", +.BI " struct ibv_exp_rereg_mr_attr *" " attr"); +.fi +.fi +.SH "DESCRIPTION" +.B ibv_exp_rereg_mr() +modifies the attributes of an existing memory region (MR) +.I mr\fR. +Conceptually, this call performs the functions deregister memory region +followed by register memory region. Where possible, +resources are reused instead of deallocated and reallocated. +.PP +.I flags\fR +is a bit-mask used to indicate which of the following properties of the memory region are being modified. Flags should be a combination (bit field) of: +.PP +.TP +.B IBV_EXP_REREG_MR_CHANGE_TRANSLATION \fR Change translation (location and length) +.TP +.B IBV_EXP_REREG_MR_CHANGE_PD \fR Change protection domain +.TP +.B IBV_EXP_REREG_MR_CHANGE_ACCESS \fR Change access flags +.PP +When +.B IBV_EXP_REREG_MR_CHANGE_PD +is used, +.I pd\fR +represents the new PD this MR should be registered to. +.br +When +.B IBV_EXP_REREG_MR_CHANGE_TRANSLATION +is used, +.I addr\fR +represents the virtual address (user-space pointer) of the new MR, while +.I length\fR +represents its length.
+.PP +The access and other flags are represented in the field +.I access\fR. +This field describes the desired memory protection attributes; it is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_EXP_ACCESS_LOCAL_WRITE \fR Enable Local Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access +.TP +.B IBV_EXP_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_EXP_ACCESS_ALLOCATE_MR\fR Enable contiguous pages allocation +.PP +When +.B IBV_EXP_ACCESS_ALLOCATE_MR +is used, both +.I addr +and +.I length +should be NULL and 0 respectively, and the allocation is done internally. +.PP +.I attr\fR +is available for future extensions. +.SH "RETURN VALUE" +.B ibv_exp_rereg_mr() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +If the memory re-registration call failed, the MR can't be used. +Even on a failure, the user still needs to call ibv_dereg_mr on this MR. +.SH "SEE ALSO" +.BR ibv_reg_mr (3), +.BR ibv_dereg_mr (3) +.SH "AUTHORS" +.TP +Matan Barak Index: contrib/ofed/libibverbs/man/ibv_fork_init.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_fork_init.3 +++ contrib/ofed/libibverbs/man/ibv_fork_init.3 @@ -23,6 +23,11 @@ spaces via an .B exec() operation. +.PP +When +.B ibv_fork_init()\fR is used at the parent, the child process should call +.B exec*()\fR or +.B exit()\fR immediately after fork. .SH "RETURN VALUE" .B ibv_fork_init() returns 0 on success, or the value of errno on failure (which indicates the failure reason). @@ -38,15 +43,25 @@ .BR RDMAV_FORK_SAFE or .BR IBV_FORK_SAFE -has the same effect as calling +to any value has the same effect as calling .B ibv_fork_init()\fR. .PP +Setting the environment variable +.BR RDMAV_HUGEPAGES_SAFE +to any value tells the library to check the underlying page size used by the +kernel for memory regions.
This is required if an application uses huge +pages either directly or indirectly via a library such as libhugetlbfs. +.PP Calling .B ibv_fork_init() will reduce performance due to an extra system call for every memory registration, and the additional memory allocated to track memory regions. The precise performance impact depends on the workload and usually will not be significant. +.PP +Setting +.BR RDMAV_HUGEPAGES_SAFE +adds further overhead to all memory registrations. .SH "SEE ALSO" .BR fork (2), .BR wait (2), @@ -55,4 +70,4 @@ .BR ibv_get_device_list (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_get_async_event.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_get_async_event.3 +++ contrib/ofed/libibverbs/man/ibv_get_async_event.3 @@ -81,6 +81,8 @@ .B IBV_EVENT_SM_CHANGE \fR SM was changed on a port .TP .B IBV_EVENT_CLIENT_REREGISTER \fR SM sent a CLIENT_REREGISTER request to a port +.TP +.B IBV_EVENT_GID_CHANGE \fR GID table was changed on a port .PP .I CA events: .TP @@ -159,4 +161,4 @@ .BR ibv_open_device (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_get_cq_event.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_get_cq_event.3 +++ contrib/ofed/libibverbs/man/ibv_get_cq_event.3 @@ -182,4 +182,4 @@ .SH "AUTHORS" .TP Dotan Barak -.RI < dotanb@mellanox.co.il > +.RI < dotanba@gmail.com > Index: contrib/ofed/libibverbs/man/ibv_get_device_guid.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_get_device_guid.3 +++ contrib/ofed/libibverbs/man/ibv_get_device_guid.3 @@ -10,7 +10,7 @@ .BI "uint64_t ibv_get_device_guid(struct ibv_device " "*device" "); .fi .SH "DESCRIPTION" -.B ibv_get_device_name() +.B ibv_get_device_guid() returns the Global Unique IDentifier (GUID) of the RDMA device .I device\fR. 
.SH "RETURN VALUE" @@ -22,4 +22,4 @@ .BR ibv_open_device (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_get_device_list.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_get_device_list.3 +++ contrib/ofed/libibverbs/man/ibv_get_device_list.3 @@ -57,4 +57,4 @@ .BR ibv_open_device (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_get_device_name.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_get_device_name.3 +++ contrib/ofed/libibverbs/man/ibv_get_device_name.3 @@ -22,4 +22,4 @@ .BR ibv_open_device (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_get_srq_num.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_get_srq_num.3 @@ -0,0 +1,32 @@ +.\" -*- nroff -*- +.\" +.TH IBV_GET_SRQ_NUM 3 2013-06-26 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_get_srq_num \- return srq number associated with the given shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_get_srq_num(struct ibv_srq " "*srq" , +.BI " uint32_t " "*srq_num" ); +.fi +.SH "DESCRIPTION" +.B ibv_get_srq_num() +return srq number associated with the given shared receive queue +The argument +.I srq +is an ibv_srq struct, as defined in . +.I srq_num +is an output parameter that holds the returned srq number. +.PP +.nf +.SH "RETURN VALUE" +.B ibv_get_srq_num() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). 
+.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_srq (3), +.BR ibv_create_srq_ex (3) +.SH "AUTHORS" +.TP +Yishai Hadas Index: contrib/ofed/libibverbs/man/ibv_intf.1 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_intf.1 @@ -0,0 +1,92 @@ +.TH IBV_INTF 1 "May 03, 2015" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_intf \- Test application to utilize different send/receive/poll interfaces + +.SH SYNOPSIS +.B ibv_intf +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-t inline-recv] [\-S send-verbs] +[\-R recv-verbs] [\-P poll-verbs] [\-c cpus-list] [\-b burst] +[\-T num-threads] [\-C] [\-A] \fBHOSTNAME\fR + +.B ibv_intf +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-t inline-recv] [\-S send-verbs] +[\-R recv-verbs] [\-P poll-verbs] [\-c cpus-list] [\-b burst] +[\-T num-threads] [\-C] [\-A] + +.SH DESCRIPTION +.PP +Run an efficient multi-threaded post\-send/receive test over InfiniBand via the reliable +connected (RC) transport. +Use different interfaces to send/receive messages and poll completions. +Unless the '\-A' option is set, the application utilizes the resource-domain verbs object to gain performance. +The application allocates one resource-domain per application thread.
+.TP +By using the '\-S', '\-R' and '\-P' options the application covers the following verbs interfaces: +.TP + \-S + \tS_NORM \- ibv_post_send + \tS_PEND \- send_pending (member of ibv_exp_qp_burst_family) + \tS_PEND_INL \- send_pending_inline (member of ibv_exp_qp_burst_family) + \tS_PEND_SG_LIST \- send_pending_sg_list (member of ibv_exp_qp_burst_family) + \tS_BURST \- send_burst (member of ibv_exp_qp_burst_family) +.TP + \-R + \tR_NORM \- ibv_post_recv + \tR_BURST \- recv_burst (member of ibv_exp_qp_burst_family) +.TP + \-P + \tP_NORM \- ibv_poll_cq + \tP_CNT \- poll_cnt (member of ibv_exp_cq_family) + \tP_LEN \- poll_length (member of ibv_exp_cq_family) + +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR= listen on/connect to port (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR= use IB device (default mlx4_0) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR= use port of IB device (default 1) +.TP +\fB\-s\fR, \fB\-\-size\fR= size of message (default 64 max size 65,536) +.TP +\fB\-m\fR, \fB\-\-mtu\fR= path MTU (default 4,096) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR= receive queue size (default 2,100) +.TP +\fB\-n\fR, \fB\-\-iters\fR= number of messages (default 1,000,000) +.TP +\fB\-l\fR, \fB\-\-sl\fR= service level value (default 0) +.TP +\fB\-t\fR, \fB\-\-inline\-recv\fR= size of inline\-recv (default 0) +.TP +\fB\-S\fR, \fB\-\-send\-verb\fR= send verb interface to use S_NORM/S_PEND/S_PEND_INL/S_PEND_SG_LIST/S_BURST (default S_PEND) +.TP +\fB\-R\fR, \fB\-\-recv\-verb\fR= recv verb interface to use R_NORM/R_BURST (default R_BURST) +.TP +\fB\-P\fR, \fB\-\-poll\-verb\fR= poll verb interface to use P_NORM/P_CNT/P_LEN (default send: P_CNT recv: P_LEN) +.TP +\fB\-c\fR, \fB\-\-cpus\-list\fR= CPUs list to run on (default [0..5],[12..17]) +.TP +\fB\-b\fR, \fB\-\-burst\fR= size of send/recv wr burst (default 10) +.TP +\fB\-T\fR, \fB\-\-num\-threads\fR= Number of threads to run (default 1) +.TP +\fB\-C\fR, \fB\-\-check\-data\fR check the data received (default no\-checks) +.TP 
+\fB\-A\fR, \fB\-\-avoid\-res\-domain\fR avoid usage of resource domain (default use res\-domain) + +.SH SEE ALSO +.BR ibv_rc_pingpong (1), +.BR ibv_uc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_srq_pingpong (1) + +.SH AUTHORS +.TP +Moshe Lazer +.RI < moshel@mellanox.com > Index: contrib/ofed/libibverbs/man/ibv_modify_qp.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_modify_qp.3 +++ contrib/ofed/libibverbs/man/ibv_modify_qp.3 @@ -159,6 +159,44 @@ RTS \fB IBV_QP_STATE, IBV_QP_SQ_PSN, IBV_QP_MAX_QP_RD_ATOMIC, \fR \fB IBV_QP_RETRY_CNT, IBV_QP_RNR_RETRY, IBV_QP_TIMEOUT \fR .fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_RAW_PACKET\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- +Init \fB IBV_QP_STATE, IBV_QP_PORT\fR +RTR \fB IBV_QP_STATE\fR +RTS \fB IBV_QP_STATE\fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_XRC_RECV\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\ +-\-\- +Init \fB IBV_QP_STATE, IBV_QP_PKEY_INDEX, IBV_QP_PORT, \fR + \fB IBV_QP_ACCESS_FLAGS \fR +RTR \fB IBV_QP_STATE, IBV_QP_AV, IBV_QP_PATH_MTU, \fR + \fB IBV_QP_DEST_QPN, IBV_QP_RQ_PSN, \fR + \fB IBV_QP_MAX_DEST_RD_ATOMIC, IBV_QP_MIN_RNR_TIMER \fR +RTS \fB IBV_QP_STATE, IBV_QP_SQ_PSN, IBV_QP_TIMEOUT \fR +.fi +.PP +.nf +For QP Transport Service Type \fB IBV_QPT_XRC_SEND\fR: +.sp +Next state Required attributes +\-\-\-\-\-\-\-\-\-\- \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\ +-\-\- +Init \fB IBV_QP_STATE, IBV_QP_PKEY_INDEX, IBV_QP_PORT, \fR + \fB IBV_QP_ACCESS_FLAGS \fR +RTR \fB IBV_QP_STATE, IBV_QP_AV, IBV_QP_PATH_MTU, \fR + \fB IBV_QP_DEST_QPN, IBV_QP_RQ_PSN \fR +RTS \fB IBV_QP_STATE, IBV_QP_SQ_PSN, IBV_QP_MAX_QP_RD_ATOMIC, \fR + \fB IBV_QP_RETRY_CNT, IBV_QP_RNR_RETRY, IBV_QP_TIMEOUT \fR +.fi .SH "SEE 
ALSO" .BR ibv_create_qp (3), .BR ibv_destroy_qp (3), @@ -166,4 +204,4 @@ .BR ibv_create_ah (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_modify_srq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_modify_srq.3 +++ contrib/ofed/libibverbs/man/ibv_modify_srq.3 @@ -60,4 +60,4 @@ .BR ibv_query_srq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_open_device.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_open_device.3 +++ contrib/ofed/libibverbs/man/ibv_open_device.3 @@ -40,4 +40,4 @@ .BR ibv_query_pkey (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_open_qp.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_open_qp.3 @@ -0,0 +1,51 @@ +.\" -*- nroff -*- +.\" +.TH IBV_OPEN_QP 3 2011-08-12 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_open_qp \- open a shareable queue pair (QP) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_qp *ibv_open_qp(struct ibv_context " "*context" , +.BI " struct ibv_qp_open_attr " "*qp_open_attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_open_qp() +opens an existing queue pair (QP) associated with the extended protection domain +.I xrcd\fR. +The argument +.I qp_open_attr +is an ibv_qp_open_attr struct, as defined in . +.PP +.nf +struct ibv_qp_open_attr { +.in +8 +uint32_t comp_mask; /* Identifies valid fields */ +uint32_t qp_num; /* QP number */ +struct ibv_xrcd *xrcd; /* XRC domain */ +void *qp_context; /* Associated context of the QP */ +enum ibv_qp_type qp_type; /* QP transport service type */ +.fi +.PP +.B ibv_destroy_qp() +closes the opened QP and destroys the underlying QP if it has no +other references. +.I qp\fR. +.SH "RETURN VALUE" +.B ibv_open_qp() +returns a pointer to the opened QP, or NULL if the request fails.
+Check the QP number (\fBqp_num\fR) in the returned QP. +.SH "NOTES" +.B ibv_open_qp() +will fail if it is asked to open a QP that does not exist within +the xrcd with the specified qp_num and qp_type. +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_create_qp (3), +.BR ibv_create_qp_ex (3), +.BR ibv_modify_qp (3), +.BR ibv_query_qp (3) +.SH "AUTHORS" +.TP +Sean Hefty Index: contrib/ofed/libibverbs/man/ibv_open_xrcd.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_open_xrcd.3 @@ -0,0 +1,76 @@ +.\" -*- nroff -*- +.\" +.TH IBV_OPEN_XRCD 3 2011-06-17 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_open_xrcd, ibv_close_xrcd \- open or close an XRC protection domain (XRCD) +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "struct ibv_xrcd *ibv_open_xrcd(struct ibv_context " "*context" "," +.BI " struct ibv_xrcd_init_attr " "*xrcd_init_attr" ); +.sp +.BI "int ibv_close_xrcd(struct ibv_xrcd " "*xrcd" ); +.fi +.SH "DESCRIPTION" +.B ibv_open_xrcd() +opens an XRC domain for the RDMA device context +.I context\fR. +.I xrcd_init_attr +is an ibv_xrcd_init_attr struct, as defined in . +.PP +.nf +struct ibv_xrcd_init_attr { +.in +8 +uint32_t comp_mask; /* Identifies valid fields */ +int fd; +int oflag; +.fi +.PP +.I fd +is the file descriptor to associate with the XRCD. +.I oflag +describes the desired creation attributes. It is a bitwise OR of zero or more +of the following flags: +.PP +.TP +.B O_CREAT +Indicates that an XRCD should be created and associated with the inode referenced +by the given fd. If the XRCD exists, this flag has no effect except as noted under +.BR O_EXCL +below.\fR +.TP +.B O_EXCL +If +.BR O_EXCL +and +.BR O_CREAT +are set, open will fail if an XRCD associated with the inode exists. +.PP +If +.I fd +equals -1, no inode is associated with the XRCD. To indicate that an XRCD should be created, use +.I oflag += +.B O_CREAT\fR. +.PP +.B ibv_close_xrcd() +closes the XRCD +.I xrcd\fR.
+If this is the last reference, the XRCD will be destroyed. +.SH "RETURN VALUE" +.B ibv_open_xrcd() +returns a pointer to the opened XRCD, or NULL if the request fails. +.PP +.B ibv_close_xrcd() +returns 0 on success, or the value of errno on failure (which indicates the +failure reason). +.SH "NOTES" +.B ibv_close_xrcd() +may fail if any other resource is still associated with the XRCD being closed. +.SH "SEE ALSO" +.BR ibv_create_srq_ex (3), +.BR ibv_create_qp_ex (3), +.SH "AUTHORS" +.TP +Sean Hefty Index: contrib/ofed/libibverbs/man/ibv_poll_cq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_poll_cq.3 +++ contrib/ofed/libibverbs/man/ibv_poll_cq.3 @@ -74,4 +74,4 @@ .BR ibv_post_recv (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_post_recv.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_post_recv.3 +++ contrib/ofed/libibverbs/man/ibv_post_recv.3 @@ -73,4 +73,4 @@ .BR ibv_poll_cq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_post_send.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_post_send.3 +++ contrib/ofed/libibverbs/man/ibv_post_send.3 @@ -60,7 +60,20 @@ } ud; .in -8 } wr; -uint32_t xrc_remote_srq_num; /* SRQ number of the destination XRC */ +union { +.in +8 +union { +.in +8 +struct { +.in +8 +uint32_t remote_srqn; /* SRQ number of the destination SRQ */ +.in -8 +} xrc; +.in -8 +} qp_type; +uint32_t xrc_remote_srq_num; /* SRQ number of the destination SRQ */ +.in -8 +}; .in -8 }; .sp @@ -77,15 +90,15 @@ Each QP Transport Service Type supports a specific set of opcodes, as shown in the following table: .PP .nf -OPCODE | IBV_QPT_UD | IBV_QPT_UC | IBV_QPT_RC | IBV_QPT_XRC 
-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\- -IBV_WR_SEND | X | X | X | X -IBV_WR_SEND_WITH_IMM | X | X | X | X -IBV_WR_RDMA_WRITE | | X | X | X -IBV_WR_RDMA_WRITE_WITH_IMM | | X | X | X -IBV_WR_RDMA_READ | | | X | X -IBV_WR_ATOMIC_CMP_AND_SWP | | | X | X -IBV_WR_ATOMIC_FETCH_AND_ADD | | | X | X +OPCODE | IBV_QPT_UD | IBV_QPT_UC | IBV_QPT_RC +\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\- +IBV_WR_SEND | X | X | X +IBV_WR_SEND_WITH_IMM | X | X | X +IBV_WR_RDMA_WRITE | | X | X +IBV_WR_RDMA_WRITE_WITH_IMM | | X | X +IBV_WR_RDMA_READ | | | X +IBV_WR_ATOMIC_CMP_AND_SWP | | | X +IBV_WR_ATOMIC_FETCH_AND_ADD | | | X .fi .PP The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags: @@ -115,11 +128,10 @@ after the call returns. 
.SH "SEE ALSO" .BR ibv_create_qp (3), -.BR ibv_create_xrc_rcv_qp (3), .BR ibv_create_ah (3), .BR ibv_post_recv (3), .BR ibv_post_srq_recv (3), .BR ibv_poll_cq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_post_srq_recv.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_post_srq_recv.3 +++ contrib/ofed/libibverbs/man/ibv_post_srq_recv.3 @@ -65,4 +65,4 @@ .BR ibv_poll_cq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_device.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_device.3 +++ contrib/ofed/libibverbs/man/ibv_query_device.3 @@ -81,4 +81,4 @@ .BR ibv_query_gid (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_gid.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_gid.3 +++ contrib/ofed/libibverbs/man/ibv_query_gid.3 @@ -30,4 +30,4 @@ .BR ibv_query_pkey (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_pkey.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_pkey.3 +++ contrib/ofed/libibverbs/man/ibv_query_pkey.3 @@ -30,4 +30,4 @@ .BR ibv_query_gid (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_port.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_port.3 +++ contrib/ofed/libibverbs/man/ibv_query_port.3 @@ -48,6 +48,9 @@ .in -8 }; .sp +possible values for the link layer field are IBV_LINK_LAYER_INFINIBAND, +IBV_LINK_LAYER_ETHERNET, or IBV_LINK_LAYER_UNSPECIFIED. 
+.sp .fi .SH "RETURN VALUE" .B ibv_query_port() @@ -59,4 +62,4 @@ .BR ibv_create_ah (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_qp.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_qp.3 +++ contrib/ofed/libibverbs/man/ibv_query_qp.3 @@ -87,4 +87,4 @@ .BR ibv_create_ah (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_query_srq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_query_srq.3 +++ contrib/ofed/libibverbs/man/ibv_query_srq.3 @@ -41,4 +41,4 @@ .BR ibv_modify_srq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_rate_to_mbps.3 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_rate_to_mbps.3 @@ -0,0 +1,45 @@ +.\" -*- nroff -*- +.\" +.TH IBV_RATE_TO_MBPS 3 2012-03-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +.nf +ibv_rate_to_mbps \- convert IB rate enumeration to Mbit/sec +.sp +mbps_to_ibv_rate \- convert Mbit/sec to an IB rate enumeration +.SH "SYNOPSIS" +.nf +.B #include +.sp +.BI "int ibv_rate_to_mbps(enum ibv_rate " "rate" "); +.sp +.BI "enum ibv_rate mbps_to_ibv_rate(int " "mbps" "); +.fi +.SH "DESCRIPTION" +.B ibv_rate_to_mbps() +converts the IB transmission rate enumeration +.I rate +to a number of Mbit/sec. For example, if +.I rate +is +.BR IBV_RATE_5_GBPS\fR, +the value 5000 will be returned (5 Gbit/sec = 5000 Mbit/sec). +.PP +.B mbps_to_ibv_rate() +converts the number of Mbit/sec +.I mbps +to an IB transmission rate enumeration. For example, if +.I mbps +is 5000, the rate enumeration +.BR IBV_RATE_5_GBPS +will be returned. +.SH "RETURN VALUE" +.B ibv_rate_to_mbps() +returns the number of Mbit/sec. +.PP +.B mbps_to_ibv_rate() +returns the enumeration representing the IB transmission rate.
+.SH "SEE ALSO" +.BR ibv_query_port (3) +.SH "AUTHORS" +.TP +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_rate_to_mult.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_rate_to_mult.3 +++ contrib/ofed/libibverbs/man/ibv_rate_to_mult.3 @@ -43,4 +43,4 @@ .BR ibv_query_port (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_reg_mr.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_reg_mr.3 +++ contrib/ofed/libibverbs/man/ibv_reg_mr.3 @@ -73,4 +73,4 @@ .BR ibv_post_srq_recv (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_req_notify_cq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_req_notify_cq.3 +++ contrib/ofed/libibverbs/man/ibv_req_notify_cq.3 @@ -40,4 +40,4 @@ .BR ibv_get_cq_event (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_resize_cq.3 =================================================================== --- contrib/ofed/libibverbs/man/ibv_resize_cq.3 +++ contrib/ofed/libibverbs/man/ibv_resize_cq.3 @@ -39,4 +39,4 @@ .BR ibv_destroy_cq (3) .SH "AUTHORS" .TP -Dotan Barak +Dotan Barak Index: contrib/ofed/libibverbs/man/ibv_shared_mr.1 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_shared_mr.1 @@ -0,0 +1,37 @@ +.TH IBV_SHARED_MR 1 "August 28, 2012" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_shared_mr \- simple InfiniBand program which demonstrates the usage of a shared memory region + +.SH SYNOPSIS +.B ibv_shared_mr +[\-p port] [\-d device] [\-s size] [\-n no-rdma] +\fBHOSTNAME\fR + +.SH DESCRIPTION +.PP +Run a simple shared memory region test over InfiniBand. +Server creates a shared memory region,client gets its ID and writes some data directly to +its memory, server verifies the data. 
+Note - both client and server must run on the same machine. + +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +use messages of size \fISIZE\fR (default 4096) +.TP +\fB\-n\fR, \fB\-\-no-rdma\fR +shared memory region is used only for shared memory purposes - no rdma. +.SH AUTHORS +.TP +Yishai Hadas +.RI + Index: contrib/ofed/libibverbs/man/ibv_task_pingpong.1 =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/man/ibv_task_pingpong.1 @@ -0,0 +1,79 @@ +.TH IBV_TASK_PINGPONG 1 2013-03-10 "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_task_pingpong \- ping-pong test that demonstrates the use of verbs_post_task(). + +.SH SYNOPSIS +.B ibv_task_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-m mtu] +[\-c calc] [\-t op_type] [\-o operands] [\-w wait] +\fBHOSTNAME\fR + +.B ibv_task_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-m mtu] +[\-c calc] [\-t op_type] [\-o operands] [\-w wait] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the reliable +connected (RC) transport using TASKs based on WAIT, CALC, SEND_EN +work requests.
+ +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 4096) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 1000) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +use \fISL\fR as the service level value of the QP (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-m\fR, \fB\-\-mtu\fR=\fISIZE\fR +path MTU (default 4096) +.TP +\fB\-c\fR, \fB\-\-calc\fR=\fIOPERATION\fR +calc operation +.TP +\fB\-t\fR, \fB\-\-op_type\fR=\fITYPE\fR +calc operands type +.TP +\fB\-o\fR, \fB\-\-operands\fR=\fIO1,O2...\fR +comma-separated list of operands +.TP +\fB\-w\fR, \fB\-\-wait_cq\fR=\fIWAIT\fR +wait for entries on CQ + +.SH SEE ALSO +.BR ibv_uc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_srq_pingpong (1) + +.SH AUTHORS +.TP +Igor Ivanov +.RI < Igor.Ivanov@itseez.com > +.TP +Roland Dreier +.RI < rolandd@cisco.com > Index: contrib/ofed/libibverbs/man/verbs.7 =================================================================== --- contrib/ofed/libibverbs/man/verbs.7 +++ contrib/ofed/libibverbs/man/verbs.7 @@ -1,6 +1,6 @@ .\" -*- nroff -*- .\" -.TH VERBS 7 2008-02-25 libibverbs "Libibverbs Programmer's Manual" +.TH VERBS 7 2013-08-22 libibverbs "Libibverbs Programmer's Manual" .SH "NAME" verbs \- Infiniband verbs library .SH "SYNOPSIS" @@ -8,16 +8,14 @@ .B #include .fi .SH "DESCRIPTION" -This library is an implementation of the verbs based on the Infiniband specification volume 1.2 chapter 11.
It handles the control path of creating, modifying, querying and destroying resources such as Protection Domains (PD), Completion Queues (CQ), Queue-Pairs (QP), Shared Receive Queues (SRQ), Address Handles (AH), Memory Regions (MR). It also handles sending and receiving data posted to QPs and SRQs, getting completions from CQs using polling and completions events. +This library is an implementation of the RDMA verbs for both Infiniband (according to the Infiniband specification volume 1, release 1.2.1) and iWarp. It handles the control path of creating, modifying, querying and destroying resources such as Protection Domains (PD), Completion Queues (CQ), Queue-Pairs (QP), Shared Receive Queues (SRQ), Address Handles (AH), Memory Regions (MR). It also handles sending and receiving data posted to QPs and SRQs, getting completions from CQs using polling and completions events. The control path is implemented through system calls to the uverbs kernel module which further calls the low level HW driver. The data path is implemented through calls made to low level HW library which in most cases interacts directly with the HW providing kernel and network stack bypass (saving context/mode switches) along with zero copy and an asynchronous I/O model. Typically, under network and RDMA programming, there are operations which involve interaction with remote peers (such as address resolution and connection establishment) and remote entities (such as route resolution and joining a multicast group under IB), where a resource managed through IB verbs such as QP or AH would be eventually created or effected from this interaction. In such cases, applications whose addressing semantics is based on IP can use librdmacm (see rdma_cm(7)) which works in conjunction with libibverbs. 
-This library is thread safe library and verbs can be called from every thread in the process (the same resource can even be handled from different threads, for example: ibv_poll_cq can be called from more than one thread). - -However, it is up to the user to stop working with a resource after it was destroyed (by the same thread or by any other thread), this may result a segmentation fault. +This library is a thread safe library and verbs can be called from every thread in the process. The same resource can even be handled from different threads (the atomicity of the operations is guaranteed). However, it is up to the user to stop working with a resource after it was destroyed (by the same thread or by any other thread); not doing so may result in a segmentation fault. If fork (or any other system call that perform fork directly or indirectly) is being used, please see ibv_fork_init(3). @@ -72,6 +70,9 @@ size_t length, enum ibv_access_flags access); int ibv_dereg_mr(struct ibv_mr *mr); +struct ibv_mr *ibv_reg_shared_mr(uint32_t mr_handle, struct ibv_pd *pd, + void *addr, int access); + .B Address Handles struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); @@ -116,26 +117,10 @@ struct ibv_srq_attr *srq_attr, enum ibv_srq_attr_mask srq_attr_mask); int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); +struct ibv_srq *ibv_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); +int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num); -.B eXtended Reliable Connection control - -struct ibv_xrc_domain *ibv_open_xrc_domain(struct ibv_context *context, - int fd, int oflag); -int ibv_close_xrc_domain(struct ibv_xrc_domain *d); -struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *srq_init_attr); -int ibv_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_rcv_qpn); -int
ibv_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask); -int ibv_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask, - struct ibv_qp_init_attr *init_attr); -int ibv_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num); -int ibv_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num); - .B Queue Pair control struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, @@ -146,6 +131,10 @@ int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, enum ibv_qp_attr_mask attr_mask, struct ibv_qp_init_attr *init_attr); +struct ibv_qp *ibv_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); +struct ibv_qp *ibv_open_qp(struct ibv_context *context, + struct ibv_qp_open_attr *qp_open_attr); .B posting Work Requests to QPs/SRQs int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, @@ -161,10 +150,32 @@ int ibv_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid); int ibv_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid); +.B XRC + +struct ibv_xrcd *ibv_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); +int ibv_close_xrcd(struct ibv_xrcd *xrcd); + +.B Flow steering + +struct ibv_flow *ibv_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow); +int ibv_destroy_flow(struct ibv_flow *flow_id); + +.B Core-Direct + +int ibv_post_task(struct ibv_context *context, struct ibv_task *task, + struct ibv_task **bad_task); + .B General functions int ibv_rate_to_mult(enum ibv_rate rate); enum ibv_rate mult_to_ibv_rate(int mult); +int ibv_rate_to_mbps(enum ibv_rate rate); +enum ibv_rate mbps_to_ibv_rate(int mbps); +const char *ibv_event_type_str(enum ibv_event_type event); +const char *ibv_port_state_str(enum ibv_port_state port_state); +const char *ibv_node_type_str(enum ibv_node_type node_type); \fP .SH "SEE ALSO" .LP @@ 
-202,14 +213,6 @@ \fIibv_destroy_srq\fP(), \fIibv_modify_srq\fP(), \fIibv_query_srq\fP(), -\fIibv_open_xrc_domain\fP(), -\fIibv_close_xrc_domain\fP(), -\fIibv_create_xrc_srq\fP(), -\fIibv_create_xrc_rcv_qp\fP(), -\fIibv_modify_xrc_rcv_qp\fP(), -\fIibv_query_xrc_rcv_qp\fP(), -\fIibv_reg_xrc_rcv_qp\fP(), -\fIibv_unreg_xrc_rcv_qp\fP(), \fIibv_post_srq_recv\fP(), \fIibv_create_qp\fP(), \fIibv_destroy_qp\fP(), @@ -220,9 +223,13 @@ \fIibv_attach_mcast\fP(), \fIibv_detach_mcast\fP(), \fIibv_rate_to_mult\fP(), -\fImult_to_ibv_rate\fP() +\fImult_to_ibv_rate\fP(), +\fIibv_rate_to_mbps\fP(), +\fImbps_to_ibv_rate\fP() .SH "AUTHORS" .TP -Dotan Barak +Signed-off-by: Dotan Barak +.TP +Or Gerlitz .TP -Or Gerlitz +Yishai Hadas Index: contrib/ofed/libibverbs/src/cmd.c =================================================================== --- contrib/ofed/libibverbs/src/cmd.c +++ contrib/ofed/libibverbs/src/cmd.c @@ -45,50 +45,17 @@ #include "ibverbs.h" -static int ibv_cmd_get_context_v2(struct ibv_context *context, - struct ibv_get_context *new_cmd, - size_t new_cmd_size, - struct ibv_get_context_resp *resp, - size_t resp_size) -{ - struct ibv_abi_compat_v2 *t; - struct ibv_get_context_v2 *cmd; - size_t cmd_size; - uint32_t cq_fd; - - t = malloc(sizeof *t); - if (!t) - return ENOMEM; - pthread_mutex_init(&t->in_use, NULL); - - cmd_size = sizeof *cmd + new_cmd_size - sizeof *new_cmd; - cmd = alloca(cmd_size); - memcpy(cmd->driver_data, new_cmd->driver_data, new_cmd_size - sizeof *new_cmd); - - IBV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); - cmd->cq_fd_tab = (uintptr_t) &cq_fd; - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - context->async_fd = resp->async_fd; - context->num_comp_vectors = 1; - t->channel.context = context; - t->channel.fd = cq_fd; - t->channel.refcnt = 0; - context->abi_compat = t; - - return 0; -} +enum ibv_cmd_type { + IBV_CMD_BASIC, + IBV_CMD_EXTENDED +}; int 
ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, size_t cmd_size, struct ibv_get_context_resp *resp, size_t resp_size) { if (abi_ver <= 2) - return ibv_cmd_get_context_v2(context, cmd, cmd_size, resp, resp_size); + return ENOSYS; IBV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); @@ -103,6 +70,55 @@ return 0; } +void ibv_cmd_query_device_assign(struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device_resp *resp) +{ + memset(device_attr->fw_ver, 0, sizeof(device_attr->fw_ver)); + *raw_fw_ver = resp->fw_ver; + device_attr->node_guid = resp->node_guid; + device_attr->sys_image_guid = resp->sys_image_guid; + device_attr->max_mr_size = resp->max_mr_size; + device_attr->page_size_cap = resp->page_size_cap; + device_attr->vendor_id = resp->vendor_id; + device_attr->vendor_part_id = resp->vendor_part_id; + device_attr->hw_ver = resp->hw_ver; + device_attr->max_qp = resp->max_qp; + device_attr->max_qp_wr = resp->max_qp_wr; + device_attr->device_cap_flags = resp->device_cap_flags; + device_attr->max_sge = resp->max_sge; + device_attr->max_sge_rd = resp->max_sge_rd; + device_attr->max_cq = resp->max_cq; + device_attr->max_cqe = resp->max_cqe; + device_attr->max_mr = resp->max_mr; + device_attr->max_pd = resp->max_pd; + device_attr->max_qp_rd_atom = resp->max_qp_rd_atom; + device_attr->max_ee_rd_atom = resp->max_ee_rd_atom; + device_attr->max_res_rd_atom = resp->max_res_rd_atom; + device_attr->max_qp_init_rd_atom = resp->max_qp_init_rd_atom; + device_attr->max_ee_init_rd_atom = resp->max_ee_init_rd_atom; + device_attr->atomic_cap = resp->atomic_cap; + device_attr->max_ee = resp->max_ee; + device_attr->max_rdd = resp->max_rdd; + device_attr->max_mw = resp->max_mw; + device_attr->max_raw_ipv6_qp = resp->max_raw_ipv6_qp; + device_attr->max_raw_ethy_qp = resp->max_raw_ethy_qp; + device_attr->max_mcast_grp = resp->max_mcast_grp; + device_attr->max_mcast_qp_attach = resp->max_mcast_qp_attach; + 
device_attr->max_total_mcast_qp_attach = + resp->max_total_mcast_qp_attach; + device_attr->max_ah = resp->max_ah; + device_attr->max_fmr = resp->max_fmr; + device_attr->max_map_per_fmr = resp->max_map_per_fmr; + device_attr->max_srq = resp->max_srq; + device_attr->max_srq_wr = resp->max_srq_wr; + device_attr->max_srq_sge = resp->max_srq_sge; + device_attr->max_pkeys = resp->max_pkeys; + device_attr->local_ca_ack_delay = resp->local_ca_ack_delay; + device_attr->phys_port_cnt = resp->phys_port_cnt; +} + + int ibv_cmd_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr, uint64_t *raw_fw_ver, @@ -117,48 +133,7 @@ VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); - memset(device_attr->fw_ver, 0, sizeof device_attr->fw_ver); - *raw_fw_ver = resp.fw_ver; - device_attr->node_guid = resp.node_guid; - device_attr->sys_image_guid = resp.sys_image_guid; - device_attr->max_mr_size = resp.max_mr_size; - device_attr->page_size_cap = resp.page_size_cap; - device_attr->vendor_id = resp.vendor_id; - device_attr->vendor_part_id = resp.vendor_part_id; - device_attr->hw_ver = resp.hw_ver; - device_attr->max_qp = resp.max_qp; - device_attr->max_qp_wr = resp.max_qp_wr; - device_attr->device_cap_flags = resp.device_cap_flags; - device_attr->max_sge = resp.max_sge; - device_attr->max_sge_rd = resp.max_sge_rd; - device_attr->max_cq = resp.max_cq; - device_attr->max_cqe = resp.max_cqe; - device_attr->max_mr = resp.max_mr; - device_attr->max_pd = resp.max_pd; - device_attr->max_qp_rd_atom = resp.max_qp_rd_atom; - device_attr->max_ee_rd_atom = resp.max_ee_rd_atom; - device_attr->max_res_rd_atom = resp.max_res_rd_atom; - device_attr->max_qp_init_rd_atom = resp.max_qp_init_rd_atom; - device_attr->max_ee_init_rd_atom = resp.max_ee_init_rd_atom; - device_attr->atomic_cap = resp.atomic_cap; - device_attr->max_ee = resp.max_ee; - device_attr->max_rdd = resp.max_rdd; - device_attr->max_mw = resp.max_mw; - device_attr->max_raw_ipv6_qp = resp.max_raw_ipv6_qp; - 
device_attr->max_raw_ethy_qp = resp.max_raw_ethy_qp; - device_attr->max_mcast_grp = resp.max_mcast_grp; - device_attr->max_mcast_qp_attach = resp.max_mcast_qp_attach; - device_attr->max_total_mcast_qp_attach = resp.max_total_mcast_qp_attach; - device_attr->max_ah = resp.max_ah; - device_attr->max_fmr = resp.max_fmr; - device_attr->max_map_per_fmr = resp.max_map_per_fmr; - device_attr->max_srq = resp.max_srq; - device_attr->max_srq_wr = resp.max_srq_wr; - device_attr->max_srq_sge = resp.max_srq_sge; - device_attr->max_pkeys = resp.max_pkeys; - device_attr->local_ca_ack_delay = resp.local_ca_ack_delay; - device_attr->phys_port_cnt = resp.phys_port_cnt; - + ibv_cmd_query_device_assign(device_attr, raw_fw_ver, &resp); return 0; } @@ -231,6 +206,52 @@ return 0; } +int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, + int vxrcd_size, + struct ibv_xrcd_init_attr *attr, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, + size_t resp_size) +{ + IBV_INIT_CMD_RESP(cmd, cmd_size, OPEN_XRCD, resp, resp_size); + + if (attr->comp_mask >= IBV_XRCD_INIT_ATTR_RESERVED) + return ENOSYS; + + if (!(attr->comp_mask & IBV_XRCD_INIT_ATTR_FD) || + !(attr->comp_mask & IBV_XRCD_INIT_ATTR_OFLAGS)) + return EINVAL; + + cmd->fd = attr->fd; + cmd->oflags = attr->oflags; + if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + + xrcd->xrcd.context = context; + xrcd->comp_mask = 0; + if (vext_field_avail(struct verbs_xrcd, handle, vxrcd_size)) { + xrcd->comp_mask = VERBS_XRCD_HANDLE; + xrcd->handle = resp->xrcd_handle; + } + + return 0; +} + +int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd) +{ + struct ibv_close_xrcd cmd; + + IBV_INIT_CMD(&cmd, sizeof cmd, CLOSE_XRCD); + cmd.xrcd_handle = xrcd->handle; + + if (write(xrcd->xrcd.context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) + return errno; + + return 0; +} + int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, 
uint64_t hca_va, int access, struct ibv_mr *mr, struct ibv_reg_mr *cmd, @@ -272,31 +293,37 @@ return 0; } -static int ibv_cmd_create_cq_v2(struct ibv_context *context, int cqe, - struct ibv_cq *cq, - struct ibv_create_cq *new_cmd, size_t new_cmd_size, - struct ibv_create_cq_resp *resp, size_t resp_size) +int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, + struct verbs_mw *mw, struct ibv_alloc_mw *cmd, + size_t cmd_size, + struct ibv_alloc_mw_resp *resp, size_t resp_size) { - struct ibv_create_cq_v2 *cmd; - size_t cmd_size; - - cmd_size = sizeof *cmd + new_cmd_size - sizeof *new_cmd; - cmd = alloca(cmd_size); - memcpy(cmd->driver_data, new_cmd->driver_data, new_cmd_size - sizeof *new_cmd); + IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_MW, resp, resp_size); + cmd->pd_handle = pd->handle; + cmd->mw_type = type; - IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); - cmd->user_handle = (uintptr_t) cq; - cmd->cqe = cqe; - cmd->event_handler = 0; - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) + if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - cq->handle = resp->cq_handle; - cq->cqe = resp->cqe; - cq->context = context; + mw->mw.context = pd->context; + mw->mw.pd = pd; + mw->mw.rkey = resp->rkey; + mw->handle = resp->mw_handle; + mw->type = type; + + return 0; +} + +int ibv_cmd_dealloc_mw(struct verbs_mw *mw, + struct ibv_dealloc_mw *cmd, size_t cmd_size) +{ + IBV_INIT_CMD(cmd, cmd_size, DEALLOC_MW); + cmd->mw_handle = mw->handle; + + if (write(mw->mw.context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; return 0; } @@ -307,10 +334,6 @@ struct ibv_create_cq *cmd, size_t cmd_size, struct ibv_create_cq_resp *resp, size_t resp_size) { - if (abi_ver <= 2) - return ibv_cmd_create_cq_v2(context, cqe, cq, - cmd, cmd_size, resp, resp_size); - IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); cmd->user_handle = (uintptr_t) cq; cmd->cqe = cqe; @@ -395,7 +418,6 @@ 
struct ibv_resize_cq *cmd, size_t cmd_size, struct ibv_resize_cq_resp *resp, size_t resp_size) { - IBV_INIT_CMD_RESP(cmd, cmd_size, RESIZE_CQ, resp, resp_size); cmd->cq_handle = cq->handle; cmd->cqe = cqe; @@ -410,27 +432,11 @@ return 0; } -static int ibv_cmd_destroy_cq_v1(struct ibv_cq *cq) -{ - struct ibv_destroy_cq_v1 cmd; - - IBV_INIT_CMD(&cmd, sizeof cmd, DESTROY_CQ); - cmd.cq_handle = cq->handle; - - if (write(cq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - - return 0; -} - int ibv_cmd_destroy_cq(struct ibv_cq *cq) { struct ibv_destroy_cq cmd; struct ibv_destroy_cq_resp resp; - if (abi_ver == 1) - return ibv_cmd_destroy_cq_v1(cq); - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_CQ, &resp, sizeof resp); cmd.cq_handle = cq->handle; cmd.reserved = 0; @@ -484,34 +490,84 @@ return 0; } -int ibv_cmd_create_xrc_srq(struct ibv_pd *pd, - struct ibv_srq *srq, struct ibv_srq_init_attr *attr, - uint32_t xrcd_handle, uint32_t xrc_cq, - struct ibv_create_xrc_srq *cmd, size_t cmd_size, - struct ibv_create_srq_resp *resp, size_t resp_size) +int ibv_cmd_create_srq_ex(struct ibv_context *context, + struct verbs_srq *srq, int vsrq_sz, + struct ibv_srq_init_attr_ex *attr_ex, + struct ibv_create_xsrq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) { - IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_XRC_SRQ, resp, resp_size); + struct verbs_xrcd *vxrcd = NULL; + + IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_XSRQ, resp, resp_size); + + if (attr_ex->comp_mask >= IBV_SRQ_INIT_ATTR_RESERVED) + return ENOSYS; + + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_PD)) + return EINVAL; + cmd->user_handle = (uintptr_t) srq; - cmd->pd_handle = pd->handle; - cmd->max_wr = attr->attr.max_wr; - cmd->max_sge = attr->attr.max_sge; - cmd->srq_limit = attr->attr.srq_limit; - cmd->xrcd_handle = xrcd_handle; - cmd->xrc_cq = xrc_cq; + cmd->pd_handle = attr_ex->pd->handle; + cmd->max_wr = attr_ex->attr.max_wr; + cmd->max_sge = attr_ex->attr.max_sge; + cmd->srq_limit 
= attr_ex->attr.srq_limit; + + cmd->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? + attr_ex->srq_type : IBV_SRQT_BASIC; + if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ)) + return EINVAL; + + vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); + cmd->xrcd_handle = vxrcd->handle; + cmd->cq_handle = attr_ex->cq->handle; + } - if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) + if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - srq->handle = resp->srq_handle; - srq->context = pd->context; - attr->attr.max_wr = resp->max_wr; - attr->attr.max_sge = resp->max_sge; + srq->srq.handle = resp->srq_handle; + srq->srq.context = context; + srq->srq.srq_context = attr_ex->srq_context; + srq->srq.pd = attr_ex->pd; + srq->srq.events_completed = 0; + pthread_mutex_init(&srq->srq.mutex, NULL); + pthread_cond_init(&srq->srq.cond, NULL); + + /* + * check that the last field is available. + * If it is then all the others exist as well + */ + if (vext_field_avail(struct verbs_srq, srq_num, vsrq_sz)) { + srq->comp_mask = IBV_SRQ_INIT_ATTR_TYPE; + srq->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? 
+ attr_ex->srq_type : IBV_SRQT_BASIC; + + if (srq->srq_type == IBV_SRQT_XRC) { + srq->comp_mask |= VERBS_SRQ_NUM; + srq->srq_num = resp->srqn; + } + + if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { + srq->comp_mask |= VERBS_SRQ_XRCD; + srq->xrcd = vxrcd; + } + + if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) { + srq->comp_mask |= VERBS_SRQ_CQ; + srq->cq = attr_ex->cq; + } + } + + attr_ex->attr.max_wr = resp->max_wr; + attr_ex->attr.max_sge = resp->max_sge; return 0; } + static int ibv_cmd_modify_srq_v3(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, @@ -583,27 +639,11 @@ return 0; } -static int ibv_cmd_destroy_srq_v1(struct ibv_srq *srq) -{ - struct ibv_destroy_srq_v1 cmd; - - IBV_INIT_CMD(&cmd, sizeof cmd, DESTROY_SRQ); - cmd.srq_handle = srq->handle; - - if (write(srq->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - - return 0; -} - int ibv_cmd_destroy_srq(struct ibv_srq *srq) { struct ibv_destroy_srq cmd; struct ibv_destroy_srq_resp resp; - if (abi_ver == 1) - return ibv_cmd_destroy_srq_v1(srq); - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_SRQ, &resp, sizeof resp); cmd.srq_handle = srq->handle; cmd.reserved = 0; @@ -629,9 +669,10 @@ IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, resp, resp_size); cmd->user_handle = (uintptr_t) qp; - cmd->pd_handle = pd->handle; + cmd->pd_handle = pd->handle; cmd->send_cq_handle = attr->send_cq->handle; cmd->recv_cq_handle = attr->recv_cq->handle; + cmd->srq_handle = attr->srq ? attr->srq->handle : 0; cmd->max_send_wr = attr->cap.max_send_wr; cmd->max_recv_wr = attr->cap.max_recv_wr; cmd->max_send_sge = attr->cap.max_send_sge; @@ -640,9 +681,6 @@ cmd->sq_sig_all = attr->sq_sig_all; cmd->qp_type = attr->qp_type; cmd->is_srq = !!attr->srq; - cmd->srq_handle = attr->qp_type == IBV_QPT_XRC ? - (attr->xrc_domain ? attr->xrc_domain->handle : 0) : - (attr->srq ? 
attr->srq->handle : 0); cmd->reserved = 0; if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) @@ -681,6 +719,56 @@ return 0; } +int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, + int vqp_sz, + struct ibv_qp_open_attr *attr, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct verbs_xrcd *xrcd; + IBV_INIT_CMD_RESP(cmd, cmd_size, OPEN_QP, resp, resp_size); + + if (attr->comp_mask >= IBV_QP_OPEN_ATTR_RESERVED) + return ENOSYS; + + if (!(attr->comp_mask & IBV_QP_OPEN_ATTR_XRCD) || + !(attr->comp_mask & IBV_QP_OPEN_ATTR_NUM) || + !(attr->comp_mask & IBV_QP_OPEN_ATTR_TYPE)) + return EINVAL; + + xrcd = container_of(attr->xrcd, struct verbs_xrcd, xrcd); + cmd->user_handle = (uintptr_t) qp; + cmd->pd_handle = xrcd->handle; + cmd->qpn = attr->qp_num; + cmd->qp_type = attr->qp_type; + + if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + + qp->qp.handle = resp->qp_handle; + qp->qp.context = context; + qp->qp.qp_context = attr->qp_context; + qp->qp.pd = NULL; + qp->qp.send_cq = qp->qp.recv_cq = NULL; + qp->qp.srq = NULL; + qp->qp.qp_num = attr->qp_num; + qp->qp.qp_type = attr->qp_type; + qp->qp.state = IBV_QPS_UNKNOWN; + qp->qp.events_completed = 0; + pthread_mutex_init(&qp->qp.mutex, NULL); + pthread_cond_init(&qp->qp.cond, NULL); + + qp->comp_mask = 0; + if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz)) { + qp->comp_mask |= VERBS_QP_XRCD; + qp->xrcd = xrcd; + } + + return 0; +} + int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr, @@ -753,14 +841,13 @@ init_attr->recv_cq = qp->recv_cq; init_attr->srq = qp->srq; init_attr->qp_type = qp->qp_type; - if (qp->qp_type == IBV_QPT_XRC) - init_attr->xrc_domain = qp->xrc_domain; init_attr->cap.max_send_wr = resp.max_send_wr; init_attr->cap.max_recv_wr = resp.max_recv_wr; init_attr->cap.max_send_sge = 
resp.max_send_sge; init_attr->cap.max_recv_sge = resp.max_recv_sge; init_attr->cap.max_inline_data = resp.max_inline_data; init_attr->sq_sig_all = resp.sq_sig_all; + qp->state = attr->cur_qp_state; return 0; } @@ -826,200 +913,8 @@ if (write(qp->context->cmd_fd, cmd, cmd_size) != cmd_size) return errno; - return 0; -} - -int ibv_cmd_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_rcv_qpn) -{ - struct ibv_create_xrc_rcv_qp cmd; - struct ibv_create_xrc_rcv_qp_resp resp; - - if (abi_ver < 6) - return ENOSYS; - - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_XRC_RCV_QP, &resp, - sizeof resp); - - cmd.xrc_domain_handle = init_attr->xrc_domain->handle; - cmd.max_send_wr = init_attr->cap.max_send_wr; - cmd.max_recv_wr = init_attr->cap.max_recv_wr; - cmd.max_send_sge = init_attr->cap.max_send_sge; - cmd.max_recv_sge = init_attr->cap.max_recv_sge; - cmd.max_inline_data = init_attr->cap.max_inline_data; - cmd.sq_sig_all = init_attr->sq_sig_all; - cmd.qp_type = init_attr->qp_type; - cmd.reserved[0] = cmd.reserved[1] = 0; - - if (write(init_attr->xrc_domain->context->cmd_fd, &cmd, sizeof cmd) != - sizeof cmd) - return errno; - - *xrc_rcv_qpn = resp.qpn; - - return 0; -} - -int ibv_cmd_modify_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask) -{ - struct ibv_modify_xrc_rcv_qp cmd; - - if (abi_ver < 6) - return ENOSYS; - - IBV_INIT_CMD(&cmd, sizeof cmd, MODIFY_XRC_RCV_QP); - - cmd.xrc_domain_handle = d->handle; - cmd.qp_num = xrc_qp_num; - cmd.attr_mask = attr_mask; - cmd.qkey = attr->qkey; - cmd.rq_psn = attr->rq_psn; - cmd.sq_psn = attr->sq_psn; - cmd.dest_qp_num = attr->dest_qp_num; - cmd.qp_access_flags = attr->qp_access_flags; - cmd.pkey_index = attr->pkey_index; - cmd.alt_pkey_index = attr->alt_pkey_index; - cmd.qp_state = attr->qp_state; - cmd.cur_qp_state = attr->cur_qp_state; - cmd.path_mtu = attr->path_mtu; - cmd.path_mig_state = attr->path_mig_state; - cmd.en_sqd_async_notify = 
attr->en_sqd_async_notify; - cmd.max_rd_atomic = attr->max_rd_atomic; - cmd.max_dest_rd_atomic = attr->max_dest_rd_atomic; - cmd.min_rnr_timer = attr->min_rnr_timer; - cmd.port_num = attr->port_num; - cmd.timeout = attr->timeout; - cmd.retry_cnt = attr->retry_cnt; - cmd.rnr_retry = attr->rnr_retry; - cmd.alt_port_num = attr->alt_port_num; - cmd.alt_timeout = attr->alt_timeout; - - memcpy(cmd.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); - cmd.dest.flow_label = attr->ah_attr.grh.flow_label; - cmd.dest.dlid = attr->ah_attr.dlid; - cmd.dest.reserved = 0; - cmd.dest.sgid_index = attr->ah_attr.grh.sgid_index; - cmd.dest.hop_limit = attr->ah_attr.grh.hop_limit; - cmd.dest.traffic_class = attr->ah_attr.grh.traffic_class; - cmd.dest.sl = attr->ah_attr.sl; - cmd.dest.src_path_bits = attr->ah_attr.src_path_bits; - cmd.dest.static_rate = attr->ah_attr.static_rate; - cmd.dest.is_global = attr->ah_attr.is_global; - cmd.dest.port_num = attr->ah_attr.port_num; - - memcpy(cmd.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); - cmd.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; - cmd.alt_dest.dlid = attr->alt_ah_attr.dlid; - cmd.alt_dest.reserved = 0; - cmd.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; - cmd.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; - cmd.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; - cmd.alt_dest.sl = attr->alt_ah_attr.sl; - cmd.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; - cmd.alt_dest.static_rate = attr->alt_ah_attr.static_rate; - cmd.alt_dest.is_global = attr->alt_ah_attr.is_global; - cmd.alt_dest.port_num = attr->alt_ah_attr.port_num; - - cmd.reserved[0] = cmd.reserved[1] = 0; - - if (write(d->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - - return 0; -} - -int ibv_cmd_query_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, int attr_mask, - struct ibv_qp_init_attr *init_attr) -{ - struct ibv_query_xrc_rcv_qp cmd; - struct ibv_query_qp_resp 
resp; - - if (abi_ver < 6) - return ENOSYS; - - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_XRC_RCV_QP, &resp, - sizeof resp); - cmd.xrc_domain_handle = d->handle; - cmd.qp_num = xrc_qp_num; - cmd.attr_mask = attr_mask; - - if (write(d->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); - - attr->qkey = resp.qkey; - attr->rq_psn = resp.rq_psn; - attr->sq_psn = resp.sq_psn; - attr->dest_qp_num = resp.dest_qp_num; - attr->qp_access_flags = resp.qp_access_flags; - attr->pkey_index = resp.pkey_index; - attr->alt_pkey_index = resp.alt_pkey_index; - attr->qp_state = resp.qp_state; - attr->cur_qp_state = resp.cur_qp_state; - attr->path_mtu = resp.path_mtu; - attr->path_mig_state = resp.path_mig_state; - attr->sq_draining = resp.sq_draining; - attr->max_rd_atomic = resp.max_rd_atomic; - attr->max_dest_rd_atomic = resp.max_dest_rd_atomic; - attr->min_rnr_timer = resp.min_rnr_timer; - attr->port_num = resp.port_num; - attr->timeout = resp.timeout; - attr->retry_cnt = resp.retry_cnt; - attr->rnr_retry = resp.rnr_retry; - attr->alt_port_num = resp.alt_port_num; - attr->alt_timeout = resp.alt_timeout; - attr->cap.max_send_wr = resp.max_send_wr; - attr->cap.max_recv_wr = resp.max_recv_wr; - attr->cap.max_send_sge = resp.max_send_sge; - attr->cap.max_recv_sge = resp.max_recv_sge; - attr->cap.max_inline_data = resp.max_inline_data; - - memcpy(attr->ah_attr.grh.dgid.raw, resp.dest.dgid, 16); - attr->ah_attr.grh.flow_label = resp.dest.flow_label; - attr->ah_attr.dlid = resp.dest.dlid; - attr->ah_attr.grh.sgid_index = resp.dest.sgid_index; - attr->ah_attr.grh.hop_limit = resp.dest.hop_limit; - attr->ah_attr.grh.traffic_class = resp.dest.traffic_class; - attr->ah_attr.sl = resp.dest.sl; - attr->ah_attr.src_path_bits = resp.dest.src_path_bits; - attr->ah_attr.static_rate = resp.dest.static_rate; - attr->ah_attr.is_global = resp.dest.is_global; - attr->ah_attr.port_num = resp.dest.port_num; - - 
memcpy(attr->alt_ah_attr.grh.dgid.raw, resp.alt_dest.dgid, 16); - attr->alt_ah_attr.grh.flow_label = resp.alt_dest.flow_label; - attr->alt_ah_attr.dlid = resp.alt_dest.dlid; - attr->alt_ah_attr.grh.sgid_index = resp.alt_dest.sgid_index; - attr->alt_ah_attr.grh.hop_limit = resp.alt_dest.hop_limit; - attr->alt_ah_attr.grh.traffic_class = resp.alt_dest.traffic_class; - attr->alt_ah_attr.sl = resp.alt_dest.sl; - attr->alt_ah_attr.src_path_bits = resp.alt_dest.src_path_bits; - attr->alt_ah_attr.static_rate = resp.alt_dest.static_rate; - attr->alt_ah_attr.is_global = resp.alt_dest.is_global; - attr->alt_ah_attr.port_num = resp.alt_dest.port_num; - - init_attr->cap.max_send_wr = resp.max_send_wr; - init_attr->cap.max_recv_wr = resp.max_recv_wr; - init_attr->cap.max_send_sge = resp.max_send_sge; - init_attr->cap.max_recv_sge = resp.max_recv_sge; - init_attr->cap.max_inline_data = resp.max_inline_data; - init_attr->sq_sig_all = resp.sq_sig_all; - - return 0; -} - -static int ibv_cmd_destroy_qp_v1(struct ibv_qp *qp) -{ - struct ibv_destroy_qp_v1 cmd; - - IBV_INIT_CMD(&cmd, sizeof cmd, DESTROY_QP); - cmd.qp_handle = qp->handle; - - if (write(qp->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; return 0; } @@ -1255,6 +1150,7 @@ cmd.attr.grh.sgid_index = attr->grh.sgid_index; cmd.attr.grh.hop_limit = attr->grh.hop_limit; cmd.attr.grh.traffic_class = attr->grh.traffic_class; + cmd.reserved = 0; memcpy(cmd.attr.grh.dgid, attr->grh.dgid.raw, 16); if (write(pd->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) @@ -1286,9 +1182,6 @@ struct ibv_destroy_qp cmd; struct ibv_destroy_qp_resp resp; - if (abi_ver == 1) - return ibv_cmd_destroy_qp_v1(qp); - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_QP, &resp, sizeof resp); cmd.qp_handle = qp->handle; cmd.reserved = 0; @@ -1338,72 +1231,200 @@ return 0; } -int ibv_cmd_open_xrc_domain(struct ibv_context *context, int fd, int oflag, - struct ibv_xrc_domain *d, - 
struct ibv_open_xrc_domain_resp *resp, - size_t resp_size) +static int ib_spec_to_kern_spec(struct ibv_exp_flow_spec *ib_spec, + struct ibv_exp_kern_spec *kern_spec, + int is_exp) { - struct ibv_open_xrc_domain cmd; + kern_spec->hdr.type = ib_spec->hdr.type; + + switch (ib_spec->hdr.type) { + case IBV_EXP_FLOW_SPEC_ETH: + kern_spec->eth.size = sizeof(struct ibv_kern_spec_eth); + memcpy(&kern_spec->eth.val, &ib_spec->eth.val, + sizeof(struct ibv_exp_flow_eth_filter)); + memcpy(&kern_spec->eth.mask, &ib_spec->eth.mask, + sizeof(struct ibv_exp_flow_eth_filter)); + break; + case IBV_EXP_FLOW_SPEC_IB: + if (!is_exp) + return EINVAL; + kern_spec->ib.size = sizeof(struct ibv_kern_spec_ib); + memcpy(&kern_spec->ib.val, &ib_spec->ib.val, + sizeof(struct ibv_exp_flow_ib_filter)); + memcpy(&kern_spec->ib.mask, &ib_spec->ib.mask, + sizeof(struct ibv_exp_flow_ib_filter)); + break; + case IBV_EXP_FLOW_SPEC_IPV4: + kern_spec->ipv4.size = sizeof(struct ibv_kern_spec_ipv4); + memcpy(&kern_spec->ipv4.val, &ib_spec->ipv4.val, + sizeof(struct ibv_exp_flow_ipv4_filter)); + memcpy(&kern_spec->ipv4.mask, &ib_spec->ipv4.mask, + sizeof(struct ibv_exp_flow_ipv4_filter)); + break; + case IBV_EXP_FLOW_SPEC_IPV6: + if (!is_exp) + return EINVAL; + kern_spec->ipv6.size = sizeof(struct ibv_exp_kern_spec_ipv6); + memcpy(&kern_spec->ipv6.val, &ib_spec->ipv6.val, + sizeof(struct ibv_exp_flow_ipv6_filter)); + memcpy(&kern_spec->ipv6.mask, &ib_spec->ipv6.mask, + sizeof(struct ibv_exp_flow_ipv6_filter)); + break; + case IBV_EXP_FLOW_SPEC_TCP: + case IBV_EXP_FLOW_SPEC_UDP: + kern_spec->tcp_udp.size = sizeof(struct ibv_kern_spec_tcp_udp); + memcpy(&kern_spec->tcp_udp.val, &ib_spec->tcp_udp.val, + sizeof(struct ibv_exp_flow_tcp_udp_filter)); + memcpy(&kern_spec->tcp_udp.mask, &ib_spec->tcp_udp.mask, + sizeof(struct ibv_exp_flow_tcp_udp_filter)); + break; + default: + return EINVAL; + } + return 0; +} - if (abi_ver < 6) - return ENOSYS; +static int flow_is_exp(struct ibv_exp_flow_attr *flow_attr) +{ + int 
i; + void *ib_spec = flow_attr + 1; + + for (i = 0; i < flow_attr->num_of_specs; i++) { + if (((struct ibv_exp_flow_spec *)ib_spec)->hdr.type == + IBV_EXP_FLOW_SPEC_IPV6) + return 1; + ib_spec += ((struct ibv_exp_flow_spec *)ib_spec)->hdr.size; + } - IBV_INIT_CMD_RESP(&cmd, sizeof cmd, OPEN_XRC_DOMAIN, resp, resp_size); - cmd.fd = fd; - cmd.oflags = oflag; + return 0; +} - if (write(context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; +/* Build a CREATE_FLOW command from flow_attr and the specs at ib_spec, write it to the QP's command fd and return a newly allocated flow handle; returns NULL on failure. */ +static struct ibv_flow *cmd_create_flow(struct ibv_qp *qp, + struct ibv_exp_flow_attr *flow_attr, + void *ib_spec, + int is_exp) +{ + struct ibv_create_flow *cmd; + struct ibv_create_flow_resp resp; + struct ibv_flow *flow_id; + size_t cmd_size; + size_t written_size; + int i, err = 0; + void *kern_spec; + int exp_flow = flow_is_exp(flow_attr); + size_t spec_size; - d->handle = resp->xrcd_handle; + /* ib_spec_to_kern_spec() fills struct ibv_exp_kern_spec slots, so budget that size for every spec. */ + spec_size = sizeof(struct ibv_exp_kern_spec); - return 0; + cmd_size = sizeof(*cmd) + (flow_attr->num_of_specs * spec_size); + + cmd = alloca(cmd_size); + flow_id = calloc(1, sizeof(*flow_id)); + if (!flow_id) + return NULL; + memset(cmd, 0, cmd_size); + + cmd->qp_handle = qp->handle; + + cmd->flow_attr.type = flow_attr->type; + cmd->flow_attr.priority = flow_attr->priority; + cmd->flow_attr.num_of_specs = flow_attr->num_of_specs; + cmd->flow_attr.port = flow_attr->port; + cmd->flow_attr.flags = flow_attr->flags; + + kern_spec = cmd + 1; + for (i = 0; i < flow_attr->num_of_specs; i++) { + err = ib_spec_to_kern_spec(ib_spec, kern_spec, is_exp); + if (err) { + errno = err; + goto err; + } + /* advance by the size actually written so only used bytes are sent */ + cmd->flow_attr.size += + ((struct ibv_kern_spec *)kern_spec)->hdr.size; + kern_spec += ((struct ibv_kern_spec *)kern_spec)->hdr.size; + ib_spec += ((struct ibv_exp_flow_spec *)ib_spec)->hdr.size; + } + + written_size = sizeof(*cmd) + cmd->flow_attr.size; + if (!exp_flow) + IBV_INIT_CMD_RESP_EX_VCMD(cmd, written_size, written_size, + CREATE_FLOW, &resp, sizeof(resp)); + else +
IBV_INIT_CMD_RESP_EXP(CREATE_FLOW, cmd, written_size, 0, + &resp, sizeof(resp), 0); + + if (write(qp->context->cmd_fd, cmd, written_size) != written_size) + goto err; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); + + flow_id->context = qp->context; + flow_id->handle = resp.flow_handle; + return flow_id; +err: + free(flow_id); + return NULL; } -int ibv_cmd_close_xrc_domain(struct ibv_xrc_domain *d) +struct ibv_exp_flow *ibv_exp_cmd_create_flow(struct ibv_qp *qp, + struct ibv_exp_flow_attr *flow_attr) { - struct ibv_close_xrc_domain cmd; + void *ib_spec = flow_attr + 1; + struct ibv_flow *fl; - if (abi_ver < 6) - return ENOSYS; + fl = cmd_create_flow(qp, flow_attr, ib_spec, 1); - IBV_INIT_CMD(&cmd, sizeof cmd, CLOSE_XRC_DOMAIN); - cmd.xrcd_handle = d->handle; + if (fl) + return (struct ibv_exp_flow *)&fl->context; - if (write(d->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - return 0; + return NULL; } -int ibv_cmd_reg_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_qp_num) +struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr) { - struct ibv_reg_xrc_rcv_qp cmd; + void *ib_spec = flow_attr + 1; - if (abi_ver < 6) - return ENOSYS; - - IBV_INIT_CMD(&cmd, sizeof cmd, REG_XRC_RCV_QP); - cmd.xrc_domain_handle = d->handle; - cmd.qp_num = xrc_qp_num; + if (flow_attr->comp_mask) { + errno = EINVAL; + return NULL; + } - if (write(d->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - return 0; + return cmd_create_flow(qp, (struct ibv_exp_flow_attr *)&flow_attr->type, + ib_spec, 0); } -int ibv_cmd_unreg_xrc_rcv_qp(struct ibv_xrc_domain *d, uint32_t xrc_qp_num) +static int cmd_destroy_flow(uint32_t handle, int cmd_fd) { - struct ibv_unreg_xrc_rcv_qp cmd; + struct ibv_destroy_flow cmd; + int ret = 0; - if (abi_ver < 6) - return ENOSYS; + memset(&cmd, 0, sizeof(cmd)); + IBV_INIT_CMD_EX(&cmd, sizeof(cmd), DESTROY_FLOW); + cmd.flow_handle = handle; - IBV_INIT_CMD(&cmd, sizeof cmd, UNREG_XRC_RCV_QP); - 
cmd.xrc_domain_handle = d->handle; - cmd.qp_num = xrc_qp_num; + if (write(cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + ret = errno; - if (write(d->context->cmd_fd, &cmd, sizeof cmd) != sizeof cmd) - return errno; - return 0; + return ret; } +int ibv_exp_cmd_destroy_flow(struct ibv_exp_flow *flow_id) +{ + int ret = cmd_destroy_flow(flow_id->handle, flow_id->context->cmd_fd); + struct ibv_flow *fl = (void *)flow_id - offsetof(struct ibv_flow, context); + + free(fl); + + return ret; +} + +int ibv_cmd_destroy_flow(struct ibv_flow *flow_id) +{ + int ret = cmd_destroy_flow(flow_id->handle, flow_id->context->cmd_fd); + + free(flow_id); + + return ret; +} Index: contrib/ofed/libibverbs/src/cmd_exp.c =================================================================== --- /dev/null +++ contrib/ofed/libibverbs/src/cmd_exp.c @@ -0,0 +1,1004 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include + +#include "ibverbs.h" + + +/* + * cmd.c experimental functions + */ +int ibv_exp_cmd_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_exp_query_device *cmd, size_t cmd_size) +{ + struct ibv_exp_query_device_resp resp; + struct ibv_query_device_resp *r_resp; + uint32_t comp_mask = 0; + + memset(&resp, 0, sizeof(resp)); + r_resp = IBV_RESP_TO_VERBS_RESP_EX(&resp, + struct ibv_exp_query_device_resp, + struct ibv_query_device_resp); + + memset(cmd, 0, sizeof(*cmd)); + cmd->comp_mask = device_attr->comp_mask; + IBV_INIT_CMD_RESP_EXP(QUERY_DEVICE, cmd, cmd_size, 0, + &resp, sizeof(resp), 0); + if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); + + ibv_cmd_query_device_assign((struct ibv_device_attr *)device_attr, + raw_fw_ver, r_resp); + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK)) { + device_attr->timestamp_mask = resp.timestamp_mask; + comp_mask |= IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK)) { + device_attr->hca_core_clock = resp.hca_core_clock; + comp_mask |= 
IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS)) { + device_attr->exp_device_cap_flags = ((struct ibv_device_attr *)device_attr)->device_cap_flags; + device_attr->exp_device_cap_flags |= resp.device_cap_flags2 << IBV_EXP_START_FLAG_LOC; + comp_mask |= IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_DC_RD_REQ) && + (resp.comp_mask & IBV_EXP_DEVICE_DC_RD_REQ)) { + device_attr->max_dc_req_rd_atom = resp.dc_rd_req; + comp_mask |= IBV_EXP_DEVICE_DC_RD_REQ; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_DC_RD_RES) && + (resp.comp_mask & IBV_EXP_DEVICE_DC_RD_RES)) { + device_attr->max_dc_res_rd_atom = resp.dc_rd_res; + comp_mask |= IBV_EXP_DEVICE_DC_RD_RES; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_MAX_DCT) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_DCT)) { + device_attr->max_dct = resp.max_dct; + comp_mask |= IBV_EXP_DEVICE_ATTR_MAX_DCT; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ)) { + device_attr->inline_recv_sz = resp.inline_recv_sz; + comp_mask |= IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ)) { + device_attr->max_rss_tbl_sz = resp.max_rss_tbl_sz; + comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS)) { + comp_mask |= IBV_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS; + device_attr->ext_atom.log_atomic_arg_sizes = resp.log_atomic_arg_sizes; + device_attr->ext_atom.max_fa_bit_boundary = resp.max_fa_bit_boundary; + device_attr->ext_atom.log_max_atomic_inline = resp.log_max_atomic_inline; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_UMR) && + (resp.comp_mask & 
IBV_EXP_DEVICE_ATTR_UMR)) { + device_attr->umr_caps.max_klm_list_size = resp.umr_caps.max_klm_list_size; + device_attr->umr_caps.max_send_wqe_inline_klms = resp.umr_caps.max_send_wqe_inline_klms; + device_attr->umr_caps.max_umr_recursion_depth = resp.umr_caps.max_umr_recursion_depth; + device_attr->umr_caps.max_umr_stride_dimension = resp.umr_caps.max_umr_stride_dimension; + comp_mask |= IBV_EXP_DEVICE_ATTR_UMR; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_ODP) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_ODP)) { + device_attr->odp_caps.general_odp_caps = resp.odp_caps.general_odp_caps; + device_attr->odp_caps.per_transport_caps.rc_odp_caps = + resp.odp_caps.per_transport_caps.rc_odp_caps; + device_attr->odp_caps.per_transport_caps.uc_odp_caps = + resp.odp_caps.per_transport_caps.uc_odp_caps; + device_attr->odp_caps.per_transport_caps.ud_odp_caps = + resp.odp_caps.per_transport_caps.ud_odp_caps; + device_attr->odp_caps.per_transport_caps.dc_odp_caps = + resp.odp_caps.per_transport_caps.dc_odp_caps; + device_attr->odp_caps.per_transport_caps.xrc_odp_caps = + resp.odp_caps.per_transport_caps.xrc_odp_caps; + device_attr->odp_caps.per_transport_caps.raw_eth_odp_caps = + resp.odp_caps.per_transport_caps.raw_eth_odp_caps; + comp_mask |= IBV_EXP_DEVICE_ATTR_ODP; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN)) { + device_attr->max_ctx_res_domain = resp.max_ctx_res_domain; + comp_mask |= IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ)) { + device_attr->max_wq_type_rq = resp.max_wq_type_rq; + comp_mask |= IBV_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_RX_HASH) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_RX_HASH)) { + device_attr->rx_hash_caps.max_rwq_indirection_tables = 
resp.rx_hash.max_rwq_indirection_tables; + device_attr->rx_hash_caps.max_rwq_indirection_table_size = resp.rx_hash.max_rwq_indirection_table_size; + device_attr->rx_hash_caps.supported_hash_functions = resp.rx_hash.supported_hash_functions; + device_attr->rx_hash_caps.supported_packet_fields = resp.rx_hash.supported_packet_fields; + device_attr->rx_hash_caps.supported_qps = resp.rx_hash.supported_qps; + comp_mask |= IBV_EXP_DEVICE_ATTR_RX_HASH; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_MAX_DEVICE_CTX) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_DEVICE_CTX)) { + device_attr->max_device_ctx = resp.max_device_ctx; + comp_mask |= IBV_EXP_DEVICE_ATTR_MAX_DEVICE_CTX; + } + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_MP_RQ) && + (resp.comp_mask & IBV_EXP_DEVICE_ATTR_MP_RQ)) { + device_attr->mp_rq_caps.allowed_shifts = resp.mp_rq_caps.allowed_shifts; + device_attr->mp_rq_caps.supported_qps = resp.mp_rq_caps.supported_qps; + device_attr->mp_rq_caps.max_single_stride_log_num_of_bytes = resp.mp_rq_caps.max_single_stride_log_num_of_bytes; + device_attr->mp_rq_caps.min_single_stride_log_num_of_bytes = resp.mp_rq_caps.min_single_stride_log_num_of_bytes; + device_attr->mp_rq_caps.max_single_wqe_log_num_of_strides = resp.mp_rq_caps.max_single_wqe_log_num_of_strides; + device_attr->mp_rq_caps.min_single_wqe_log_num_of_strides = resp.mp_rq_caps.min_single_wqe_log_num_of_strides; + comp_mask |= IBV_EXP_DEVICE_ATTR_MP_RQ; + } + + device_attr->comp_mask = comp_mask; + + return 0; +} + +int ibv_exp_cmd_create_qp(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_exp_qp_init_attr *attr_exp, + void *cmd_buf, size_t lib_cmd_size, size_t drv_cmd_size, + void *resp_buf, size_t lib_resp_size, size_t drv_resp_size, + int force_exp) +{ + struct verbs_xrcd *vxrcd = NULL; + struct ibv_exp_create_qp *cmd_exp = NULL; + struct ibv_exp_create_qp_resp *resp_exp = NULL; + struct ibv_create_qp *cmd; + struct ibv_create_qp_resp *resp; + int wsize; + + 
if (attr_exp->comp_mask >= IBV_EXP_QP_INIT_ATTR_RESERVED1) + return ENOSYS; + + cmd = cmd_buf; + resp = resp_buf; + + if (attr_exp->comp_mask >= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS || force_exp) { + cmd_exp = cmd_buf; + resp_exp = resp_buf; + wsize = lib_cmd_size + drv_cmd_size; + + /* + * Cast the extended command to the legacy command, relying on the + * fact that the legacy header size equals the 'comp_mask' field size + * and that the 'comp_mask' field sits directly before the payload + * fields + */ + cmd = (struct ibv_create_qp *)((void *)&cmd_exp->comp_mask - sizeof(cmd->response)); + /* + * Cast the extended response to the legacy response, relying on the + * fact that the 'comp_mask' field is added in front of the legacy response + */ + resp = (struct ibv_create_qp_resp *) + ((uint8_t *)resp_exp + + sizeof(resp_exp->comp_mask)); + + IBV_INIT_CMD_RESP_EXP(CREATE_QP, cmd_exp, lib_cmd_size, drv_cmd_size, + resp_exp, lib_resp_size, drv_resp_size); + } else { + wsize = lib_cmd_size + drv_cmd_size; + IBV_INIT_CMD_RESP(cmd, wsize, CREATE_QP, resp, lib_resp_size + drv_resp_size); + } + + cmd->user_handle = (uintptr_t) qp; + + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_XRCD) { + /* XRC receiver side */ + vxrcd = container_of(attr_exp->xrcd, struct verbs_xrcd, xrcd); + cmd->pd_handle = vxrcd->handle; + } else { + if (!(attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_PD)) + return EINVAL; + + cmd->pd_handle = attr_exp->pd->handle; + if (!(attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_RX_HASH)) + cmd->send_cq_handle = attr_exp->send_cq->handle; + /* XRC sender doesn't have a receive cq */ + if (attr_exp->qp_type != IBV_QPT_XRC_SEND && + attr_exp->qp_type != IBV_QPT_XRC && + attr_exp->qp_type != IBV_EXP_QPT_DC_INI && + !(attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_RX_HASH)) { + cmd->recv_cq_handle = attr_exp->recv_cq->handle; + cmd->srq_handle = attr_exp->srq ?
attr_exp->srq->handle : 0; + } + } + + cmd->max_send_wr = attr_exp->cap.max_send_wr; + cmd->max_recv_wr = attr_exp->cap.max_recv_wr; + cmd->max_send_sge = attr_exp->cap.max_send_sge; + cmd->max_recv_sge = attr_exp->cap.max_recv_sge; + cmd->max_inline_data = attr_exp->cap.max_inline_data; + cmd->sq_sig_all = attr_exp->sq_sig_all; + cmd->qp_type = (attr_exp->qp_type == IBV_QPT_XRC) ? + IBV_QPT_XRC_SEND : attr_exp->qp_type; + cmd->is_srq = !!attr_exp->srq; + cmd->reserved = 0; + + if (cmd_exp) { + cmd_exp->comp_mask = 0; + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS) { + if (attr_exp->exp_create_flags & ~IBV_EXP_QP_CREATE_MASK) + return EINVAL; + else { + cmd_exp->comp_mask |= IBV_CREATE_QP_EX_CAP_FLAGS; + cmd_exp->qp_cap_flags = attr_exp->exp_create_flags & + IBV_EXP_CREATE_QP_KERNEL_FLAGS; + } + } + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) { + cmd_exp->comp_mask |= IBV_EXP_CREATE_QP_INL_RECV; + cmd_exp->max_inl_recv = attr_exp->max_inl_recv; + } + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_QPG) { + struct ibv_exp_qpg *qpg = &attr_exp->qpg; + + switch (qpg->qpg_type) { + case IBV_EXP_QPG_PARENT: + cmd_exp->qpg.parent_attrib.rss_child_count = + qpg->parent_attrib.rss_child_count; + cmd_exp->qpg.parent_attrib.tss_child_count = + qpg->parent_attrib.tss_child_count; + break; + case IBV_EXP_QPG_CHILD_RX: + case IBV_EXP_QPG_CHILD_TX: + cmd_exp->qpg.parent_handle = + qpg->qpg_parent->handle; + break; + default: + /* unknown QP-group type: positive errno, like every other failure path here */ + return EINVAL; + } + cmd_exp->qpg.qpg_type = qpg->qpg_type; + /* request a QP group */ + cmd_exp->comp_mask |= IBV_EXP_CREATE_QP_QPG; + } + + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS) { + cmd_exp->max_inl_send_klms = attr_exp->max_inl_send_klms; + cmd_exp->comp_mask |= IBV_EXP_CREATE_QP_MAX_INL_KLMS; + } + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_RX_HASH) { + /* reject keys that would not fit the fixed-size command field */ + if (attr_exp->rx_hash_conf->rx_hash_key_len > sizeof(cmd_exp->rx_hash_info.rx_hash_key)) + return EINVAL; + +
cmd_exp->rx_hash_info.rx_hash_function = attr_exp->rx_hash_conf->rx_hash_function; + cmd_exp->rx_hash_info.rx_hash_key_len = attr_exp->rx_hash_conf->rx_hash_key_len; + cmd_exp->rx_hash_info.rx_hash_fields_mask = attr_exp->rx_hash_conf->rx_hash_fields_mask; + memcpy(cmd_exp->rx_hash_info.rx_hash_key, attr_exp->rx_hash_conf->rx_hash_key, + attr_exp->rx_hash_conf->rx_hash_key_len); + cmd_exp->rx_hash_info.rwq_ind_tbl_handle = attr_exp->rx_hash_conf->rwq_ind_tbl->ind_tbl_handle; + cmd_exp->rx_hash_info.reserved = 0; + /* no comp mask explicit bit is needed, hash function is used as an indicator */ + } + if (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_PORT) + cmd_exp->port_num = attr_exp->port_num; + + memset(cmd_exp->reserved_2, 0, sizeof(cmd_exp->reserved_2)); + } + if (write(context->cmd_fd, cmd_buf, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp_buf, lib_resp_size + drv_resp_size); + + if (abi_ver > 3) { + attr_exp->cap.max_recv_sge = resp->max_recv_sge; + attr_exp->cap.max_send_sge = resp->max_send_sge; + attr_exp->cap.max_recv_wr = resp->max_recv_wr; + attr_exp->cap.max_send_wr = resp->max_send_wr; + attr_exp->cap.max_inline_data = resp->max_inline_data; + if (resp_exp) { + attr_exp->comp_mask &= IBV_EXP_QP_INIT_ATTR_RESERVED1 - 1; + if ((resp_exp->comp_mask & IBV_EXP_CREATE_QP_RESP_INL_RECV) && + (attr_exp->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV)) + attr_exp->max_inl_recv = resp_exp->max_inl_recv; + else + attr_exp->comp_mask &= ~IBV_EXP_QP_INIT_ATTR_INL_RECV; + } + } + + if (abi_ver == 4) { + struct ibv_create_qp_resp_v4 *resp_v4 = + (struct ibv_create_qp_resp_v4 *) resp; + + memmove((void *) resp + sizeof(*resp), + (void *) resp_v4 + sizeof(*resp_v4), + lib_resp_size - sizeof(*resp)); + } else if (abi_ver <= 3) { + struct ibv_create_qp_resp_v3 *resp_v3 = + (struct ibv_create_qp_resp_v3 *) resp; + + memmove((void *) resp + sizeof(*resp), + (void *) resp_v3 + sizeof(*resp_v3), + lib_resp_size - sizeof(*resp)); + } + + qp->qp.handle = 
resp->qp_handle; + qp->qp.qp_num = resp->qpn; + qp->qp.context = context; + qp->qp.qp_context = attr_exp->qp_context; + qp->qp.pd = attr_exp->pd; + qp->qp.send_cq = attr_exp->send_cq; + qp->qp.recv_cq = attr_exp->recv_cq; + qp->qp.srq = attr_exp->srq; + qp->qp.qp_type = attr_exp->qp_type; + qp->qp.state = IBV_QPS_RESET; + qp->qp.events_completed = 0; + pthread_mutex_init(&qp->qp.mutex, NULL); + pthread_cond_init(&qp->qp.cond, NULL); + + qp->comp_mask = 0; + if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz) && + (attr_exp->comp_mask & IBV_QP_INIT_ATTR_XRCD)) { + qp->comp_mask |= VERBS_QP_XRCD; + qp->xrcd = vxrcd; + } + + return 0; +} + +int ibv_exp_cmd_create_dct(struct ibv_context *context, + struct ibv_exp_dct *dct, + struct ibv_exp_dct_init_attr *attr, + struct ibv_exp_create_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_create_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(CREATE_DCT, cmd, lib_cmd_sz, drv_cmd_sz, resp, + lib_resp_sz, drv_resp_sz); + + cmd->user_handle = (__u64)(uintptr_t)dct; + cmd->pd_handle = attr->pd->handle; + cmd->cq_handle = attr->cq->handle; + cmd->srq_handle = attr->srq->handle; + cmd->dc_key = attr->dc_key; + cmd->port = attr->port; + cmd->access_flags = attr->access_flags; + cmd->min_rnr_timer = attr->min_rnr_timer; + cmd->tclass = attr->tclass; + cmd->flow_label = attr->flow_label; + cmd->mtu = attr->mtu; + cmd->pkey_index = attr->pkey_index; + cmd->gid_index = attr->gid_index; + cmd->hop_limit = attr->hop_limit; + cmd->inline_size = attr->inline_size; + if (~IBV_EXP_DCT_CREATE_FLAGS_MASK & attr->create_flags) + return EINVAL; + + cmd->create_flags = attr->create_flags; + if (write(context->cmd_fd, cmd, wsize) != wsize) + goto err; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + attr->inline_size = resp->inline_size; + dct->events_completed = 0; + pthread_mutex_init(&dct->mutex, NULL); + pthread_cond_init(&dct->cond, NULL); + 
+ return 0; + +err: + return errno; +} + +int ibv_exp_cmd_destroy_dct(struct ibv_context *context, + struct ibv_exp_dct *dct, + struct ibv_exp_destroy_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_destroy_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(DESTROY_DCT, cmd, lib_cmd_sz, drv_cmd_sz, resp, lib_resp_sz, drv_resp_sz); + cmd->dct_handle = dct->handle; + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + pthread_mutex_lock(&dct->mutex); + while (dct->events_completed != resp->events_reported) + pthread_cond_wait(&dct->cond, &dct->mutex); + pthread_mutex_unlock(&dct->mutex); + + return 0; +} + +int ibv_exp_cmd_query_dct(struct ibv_context *context, + struct ibv_exp_query_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_query_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz, + struct ibv_exp_dct_attr *attr) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(QUERY_DCT, cmd, lib_cmd_sz, drv_cmd_sz, resp, lib_resp_sz, drv_resp_sz); + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + attr->dc_key = resp->dc_key; + attr->port = resp->port; + attr->access_flags = resp->access_flags; + attr->min_rnr_timer = resp->min_rnr_timer; + attr->tclass = resp->tclass; + attr->flow_label = resp->flow_label; + attr->mtu = resp->mtu; + attr->pkey_index = resp->pkey_index; + attr->gid_index = resp->gid_index; + attr->hop_limit = resp->hop_limit; + attr->key_violations = resp->key_violations; + attr->state = resp->state; + + return 0; +} + +int ibv_exp_cmd_arm_dct(struct ibv_context *context, + struct ibv_exp_arm_attr *attr, + struct ibv_exp_arm_dct *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_arm_dct_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz) +{ + int wsize = lib_cmd_sz + 
drv_cmd_sz; + + if (attr->comp_mask) { + errno = EINVAL; + return errno; + } + + IBV_INIT_CMD_RESP_EXP(ARM_DCT, cmd, lib_cmd_sz, drv_cmd_sz, resp, lib_resp_sz, drv_resp_sz); + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + return 0; +} + +int ibv_exp_cmd_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, + int attr_mask, + struct ibv_exp_modify_cq *cmd, size_t cmd_size) +{ + IBV_INIT_CMD_EXP(MODIFY_CQ, cmd, cmd_size, 0); + + if (attr->comp_mask >= IBV_EXP_CQ_ATTR_RESERVED) + return ENOSYS; + + cmd->comp_mask = 0; + cmd->cq_handle = cq->handle; + cmd->attr_mask = attr_mask; + cmd->cq_count = attr->moderation.cq_count; + cmd->cq_period = attr->moderation.cq_period; + + if (attr->cq_cap_flags & ~IBV_EXP_CQ_CAP_MASK) + return EINVAL; + else + cmd->cq_cap_flags = attr->cq_cap_flags; + + if (write(cq->context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + return 0; +} + +int ibv_exp_cmd_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t exp_attr_mask, struct ibv_exp_modify_qp *cmd, + size_t cmd_size) +{ + if (attr->comp_mask >= IBV_EXP_QP_ATTR_RESERVED) + return ENOSYS; + + IBV_INIT_CMD_EXP(MODIFY_QP, cmd, cmd_size, 0); + + + cmd->qp_handle = qp->handle; + cmd->attr_mask = (__u32)exp_attr_mask; + cmd->qkey = attr->qkey; + cmd->rq_psn = attr->rq_psn; + cmd->sq_psn = attr->sq_psn; + cmd->dest_qp_num = attr->dest_qp_num; + cmd->qp_access_flags = attr->qp_access_flags; + cmd->pkey_index = attr->pkey_index; + cmd->alt_pkey_index = attr->alt_pkey_index; + cmd->qp_state = attr->qp_state; + cmd->cur_qp_state = attr->cur_qp_state; + cmd->path_mtu = attr->path_mtu; + cmd->path_mig_state = attr->path_mig_state; + cmd->en_sqd_async_notify = attr->en_sqd_async_notify; + cmd->max_rd_atomic = attr->max_rd_atomic; + cmd->max_dest_rd_atomic = attr->max_dest_rd_atomic; + cmd->min_rnr_timer = attr->min_rnr_timer; + cmd->port_num = attr->port_num; + cmd->timeout = attr->timeout; + cmd->retry_cnt = attr->retry_cnt; + 
cmd->rnr_retry = attr->rnr_retry; + cmd->alt_port_num = attr->alt_port_num; + cmd->alt_timeout = attr->alt_timeout; + + memcpy(cmd->dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + cmd->dest.flow_label = attr->ah_attr.grh.flow_label; + cmd->dest.dlid = attr->ah_attr.dlid; + cmd->dest.reserved = 0; + cmd->dest.sgid_index = attr->ah_attr.grh.sgid_index; + cmd->dest.hop_limit = attr->ah_attr.grh.hop_limit; + cmd->dest.traffic_class = attr->ah_attr.grh.traffic_class; + cmd->dest.sl = attr->ah_attr.sl; + cmd->dest.src_path_bits = attr->ah_attr.src_path_bits; + cmd->dest.static_rate = attr->ah_attr.static_rate; + cmd->dest.is_global = attr->ah_attr.is_global; + cmd->dest.port_num = attr->ah_attr.port_num; + + memcpy(cmd->alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + cmd->alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + cmd->alt_dest.dlid = attr->alt_ah_attr.dlid; + cmd->alt_dest.reserved = 0; + cmd->alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + cmd->alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + cmd->alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; + cmd->alt_dest.sl = attr->alt_ah_attr.sl; + cmd->alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + cmd->alt_dest.static_rate = attr->alt_ah_attr.static_rate; + cmd->alt_dest.is_global = attr->alt_ah_attr.is_global; + cmd->alt_dest.port_num = attr->alt_ah_attr.port_num; + cmd->dct_key = attr->dct_key; + cmd->exp_attr_mask = (__u32)(exp_attr_mask >> IBV_EXP_START_FLAG_LOC); + if (attr->comp_mask & IBV_EXP_QP_ATTR_FLOW_ENTROPY) + cmd->flow_entropy = attr->flow_entropy; + cmd->reserved[0] = 0; + cmd->reserved[1] = 0; + cmd->comp_mask = attr->comp_mask; + + if (write(qp->context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + if (exp_attr_mask & IBV_EXP_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +int ibv_exp_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct 
ibv_exp_create_cq *cmd, size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_create_cq_resp *resp, size_t lib_resp_sz, size_t drv_resp_sz, + struct ibv_exp_cq_init_attr *attr) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(CREATE_CQ, cmd, lib_cmd_sz, drv_cmd_sz, resp, + lib_resp_sz, drv_resp_sz); + + cmd->comp_mask = 0; + cmd->user_handle = (uintptr_t) cq; + cmd->cqe = cqe; + cmd->comp_vector = comp_vector; + cmd->comp_channel = channel ? channel->fd : -1; + cmd->reserved = 0; + + /* the reserved bit itself is not a valid request, same as the other comp_mask checks in this file */ + if (attr->comp_mask >= IBV_EXP_CQ_INIT_ATTR_RESERVED1) + return ENOSYS; + + if (attr->comp_mask & IBV_EXP_CQ_INIT_ATTR_FLAGS) { + if (attr->flags & ~IBV_EXP_CQ_CREATE_FLAGS_MASK) + return ENOSYS; + + cmd->comp_mask |= IBV_EXP_CREATE_CQ_CAP_FLAGS; + cmd->create_flags = attr->flags; + } + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + cq->handle = resp->cq_handle; + cq->cqe = resp->cqe; + cq->context = context; + + return 0; +} + +int ibv_exp_cmd_create_mr(struct ibv_exp_create_mr_in *in, + struct ibv_mr *mr, + struct ibv_exp_create_mr *cmd, + size_t lib_cmd_sz, + size_t drv_cmd_sz, + struct ibv_exp_create_mr_resp *resp, + size_t lib_resp_sz, + size_t drv_resp_sz) +{ + struct ibv_pd *pd = in->pd; + struct ibv_context *context = pd->context; + struct ibv_exp_mr_init_attr *mr_init_attr = &in->attr; + + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(CREATE_MR, cmd, lib_cmd_sz, drv_cmd_sz, resp, + lib_resp_sz, drv_resp_sz); + + cmd->pd_handle = pd->handle; + cmd->max_klm_list_size = mr_init_attr->max_klm_list_size; + cmd->create_flags = mr_init_attr->create_flags; + cmd->exp_access_flags = mr_init_attr->exp_access_flags; + cmd->comp_mask = in->comp_mask; + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + mr->handle = resp->handle; + mr->lkey = resp->lkey; + mr->rkey = resp->rkey; + mr->context = pd->context; + +
return 0; +} + +int ibv_exp_cmd_query_mkey(struct ibv_context *context, + struct ibv_mr *mr, + struct ibv_exp_mkey_attr *mkey_attr, + struct ibv_exp_query_mkey *cmd, size_t lib_cmd_sz, + size_t drv_cmd_sz, + struct ibv_exp_query_mkey_resp *resp, + size_t lib_resp_sz, size_t drv_resp_sz) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + IBV_INIT_CMD_RESP_EXP(QUERY_MKEY, cmd, lib_cmd_sz, drv_cmd_sz, resp, + lib_resp_sz, drv_resp_sz); + + cmd->handle = mr->handle; + cmd->lkey = mr->lkey; + cmd->rkey = mr->rkey; + cmd->comp_mask = mkey_attr->comp_mask; + cmd->reserved = 0; + + if (write(context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + mkey_attr->max_klm_list_size = resp->max_klm_list_size; + + return 0; +} + +int ibv_cmd_exp_reg_mr( + const struct ibv_exp_reg_mr_in *mr_init_attr, + uint64_t hca_va, struct ibv_mr *mr, + struct ibv_exp_reg_mr *cmd, + size_t cmd_size, + struct ibv_exp_reg_mr_resp *resp, + size_t resp_size) +{ + struct ibv_pd *pd = mr_init_attr->pd; + + if (mr_init_attr->comp_mask >= IBV_EXP_REG_MR_RESERVED) + return EINVAL; + + IBV_INIT_CMD_RESP_EXP(REG_MR, cmd, cmd_size, 0, resp, resp_size, 0); + + cmd->comp_mask = 0; + cmd->start = (uintptr_t) mr_init_attr->addr; + cmd->length = mr_init_attr->length; + cmd->hca_va = hca_va; + cmd->pd_handle = pd->handle; + cmd->reserved = 0; + cmd->exp_access_flags = mr_init_attr->exp_access; + + if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + mr->handle = resp->mr_handle; + mr->lkey = resp->lkey; + mr->rkey = resp->rkey; + mr->context = pd->context; + + return 0; +} + +int ibv_exp_cmd_rereg_mr(struct ibv_mr *mr, uint32_t flags, void *addr, + size_t length, uint64_t hca_va, int access, + struct ibv_pd *pd, struct ibv_exp_rereg_mr_attr *attr, + struct ibv_exp_rereg_mr *cmd, + size_t lib_cmd_sz, size_t drv_cmd_sz, + struct ibv_exp_rereg_mr_resp *resp, + size_t lib_resp_sz, size_t 
drv_resp_sz) +{ + int wsize = lib_cmd_sz + drv_cmd_sz; + + if (attr->comp_mask & ~(IBV_EXP_REREG_MR_ATTR_RESERVED - 1)) + return -EINVAL; + + IBV_INIT_CMD_RESP_EXP(REREG_MR, cmd, lib_cmd_sz, drv_cmd_sz, resp, + lib_resp_sz, drv_resp_sz); + + cmd->comp_mask = 0; + cmd->mr_handle = mr->handle; + cmd->flags = flags; + cmd->start = (uintptr_t) addr; + cmd->length = length; + cmd->hca_va = hca_va; + cmd->pd_handle = (flags & IBV_EXP_REREG_MR_CHANGE_PD) ? pd->handle : 0; + cmd->access_flags = access; + + if (write(mr->context->cmd_fd, cmd, wsize) != wsize) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, sizeof(*resp)); + + mr->lkey = resp->lkey; + mr->rkey = resp->rkey; + if (flags & IBV_EXP_REREG_MR_CHANGE_PD) + mr->context = pd->context; + + return 0; +} + +int ibv_cmd_exp_prefetch_mr(struct ibv_mr *mr, + struct ibv_exp_prefetch_attr *attr) +{ + struct ibv_exp_prefetch_mr cmd; + + IBV_INIT_CMD_EXP(PREFETCH_MR, &cmd, sizeof(cmd), 0); + + if (attr->comp_mask >= IBV_EXP_PREFETCH_MR_RESERVED) + return EINVAL; + + if (attr->flags & ~IBV_EXP_PREFETCH_WRITE_ACCESS) + return EINVAL; + + cmd.comp_mask = 0; + cmd.mr_handle = mr->handle; + cmd.flags = attr->flags; + cmd.start = (uintptr_t) attr->addr; + cmd.length = attr->length; + + if (write(mr->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + return errno; + + return 0; +} + +int ibv_exp_cmd_create_wq(struct ibv_context *context, + struct ibv_exp_wq_init_attr *wq_init_attr, + struct ibv_exp_wq *wq, + struct ibv_exp_create_wq *cmd, + size_t cmd_core_size, + size_t cmd_size, + struct ibv_exp_create_wq_resp *resp, + size_t resp_core_size, + size_t resp_size) +{ + int err; + + IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, + EXP_CREATE_WQ, resp, + resp_core_size, resp_size); + + cmd->user_handle = (uintptr_t)wq; + cmd->pd_handle = wq_init_attr->pd->handle; + cmd->cq_handle = wq_init_attr->cq->handle; + cmd->srq_handle = wq_init_attr->srq ? 
wq_init_attr->srq->handle : -1; + cmd->wq_type = wq_init_attr->wq_type; + cmd->max_recv_sge = wq_init_attr->max_recv_sge; + cmd->max_recv_wr = wq_init_attr->max_recv_wr; + cmd->reserved = 0; + cmd->comp_mask = 0; + + if (wq_init_attr->comp_mask & IBV_EXP_CREATE_WQ_MP_RQ) { + if (cmd_core_size >= offsetof(struct ibv_exp_create_wq, mp_rq) + + sizeof(struct ibv_exp_cmd_wq_mp_rq)) { + cmd->mp_rq.use_shift = wq_init_attr->mp_rq.use_shift; + cmd->mp_rq.single_stride_log_num_of_bytes = wq_init_attr->mp_rq.single_stride_log_num_of_bytes; + cmd->mp_rq.single_wqe_log_num_of_strides = wq_init_attr->mp_rq.single_wqe_log_num_of_strides; + cmd->mp_rq.reserved = 0; + cmd->comp_mask |= IBV_EXP_CMD_CREATE_WQ_MP_RQ; + } else { + /* Provider lib is not supporting Multi-Packet RQ */ + return EINVAL; + } + } + + err = write(context->cmd_fd, cmd, cmd_size); + if (err != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + + if (resp->response_length < resp_core_size) + return EINVAL; + + wq->handle = resp->wq_handle; + wq_init_attr->max_recv_wr = resp->max_recv_wr; + wq_init_attr->max_recv_sge = resp->max_recv_sge; + wq->wq_num = resp->wqn; + wq->context = context; + wq->cq = wq_init_attr->cq; + wq->pd = wq_init_attr->pd; + wq->srq = wq_init_attr->srq; + wq->wq_type = wq_init_attr->wq_type; + + return 0; +} + +int ibv_exp_cmd_modify_wq(struct ibv_exp_wq *wq, struct ibv_exp_wq_attr *attr, + struct ib_exp_modify_wq *cmd, size_t cmd_size) +{ + IBV_INIT_CMD_EX(cmd, cmd_size, EXP_MODIFY_WQ); + + cmd->curr_wq_state = attr->curr_wq_state; + cmd->wq_state = attr->wq_state; + cmd->wq_handle = wq->handle; + cmd->comp_mask = attr->attr_mask; + + if (write(wq->context->cmd_fd, cmd, cmd_size) != cmd_size) + return errno; + + if (attr->attr_mask & IBV_EXP_WQ_ATTR_STATE) + wq->state = attr->wq_state; + + return 0; +} + +int ibv_exp_cmd_destroy_wq(struct ibv_exp_wq *wq) +{ + struct ib_exp_destroy_wq cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + IBV_INIT_CMD_EX(&cmd, 
sizeof(cmd), EXP_DESTROY_WQ); + cmd.wq_handle = wq->handle; + + if (write(wq->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + ret = errno; + + return ret; +} + +int ibv_exp_cmd_create_rwq_ind_table(struct ibv_context *context, + struct ibv_exp_rwq_ind_table_init_attr *init_attr, + struct ibv_exp_rwq_ind_table *rwq_ind_table, + struct ibv_exp_create_rwq_ind_table *cmd, + size_t cmd_core_size, + size_t cmd_size, + struct ibv_exp_create_rwq_ind_table_resp *resp, + size_t resp_core_size, + size_t resp_size) +{ + int err, i; + uint32_t required_tbl_size, alloc_tbl_size; + uint32_t *tbl_start; + int num_tbl_entries; + + alloc_tbl_size = cmd_core_size - sizeof(*cmd); + num_tbl_entries = 1 << init_attr->log_ind_tbl_size; + + /* Data must be u64 aligned */ + required_tbl_size = (num_tbl_entries * sizeof(uint32_t)) < sizeof(uint64_t) ? + sizeof(uint64_t) : (num_tbl_entries * sizeof(uint32_t)); + + if (alloc_tbl_size < required_tbl_size) + return EINVAL; + + tbl_start = (uint32_t *)((uint8_t *)cmd + sizeof(*cmd)); + for (i = 0; i < num_tbl_entries; i++) + tbl_start[i] = init_attr->ind_tbl[i]->handle; + + IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, + EXP_CREATE_RWQ_IND_TBL, resp, + resp_core_size, resp_size); + + cmd->pd_handle = init_attr->pd->handle; + cmd->log_ind_tbl_size = init_attr->log_ind_tbl_size; + cmd->reserved = 0; + cmd->comp_mask = 0; + + err = write(context->cmd_fd, cmd, cmd_size); + if (err != cmd_size) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + + if (resp->response_length < resp_core_size) + return EINVAL; + + rwq_ind_table->ind_tbl_handle = resp->ind_tbl_handle; + rwq_ind_table->ind_tbl_num = resp->ind_tbl_num; + rwq_ind_table->context = context; + return 0; +} + +int ibv_exp_cmd_destroy_rwq_ind_table(struct ibv_exp_rwq_ind_table *rwq_ind_table) +{ + struct ibv_exp_destroy_rwq_ind_table cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + IBV_INIT_CMD_EX(&cmd, sizeof(cmd), EXP_DESTROY_RWQ_IND_TBL); + cmd.ind_tbl_handle 
= rwq_ind_table->ind_tbl_handle; + + if (write(rwq_ind_table->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + ret = errno; + + return ret; +} Index: contrib/ofed/libibverbs/src/compat-1_0.c =================================================================== --- contrib/ofed/libibverbs/src/compat-1_0.c +++ contrib/ofed/libibverbs/src/compat-1_0.c @@ -238,7 +238,7 @@ l = calloc(n + 2, sizeof (struct ibv_device_1_0 *)); if (!l) - return NULL; + goto free_device_list; l[0] = (void *) real_list; @@ -258,6 +258,9 @@ for (i = 1; i <= n; ++i) if (l[i]) free(l[i]); + free(l); + +free_device_list: ibv_free_device_list(real_list); return NULL; } Index: contrib/ofed/libibverbs/src/device.c =================================================================== --- contrib/ofed/libibverbs/src/device.c +++ contrib/ofed/libibverbs/src/device.c @@ -44,37 +44,43 @@ #include #include #include +#include +#include #include #include "ibverbs.h" -static pthread_mutex_t device_list_lock = PTHREAD_MUTEX_INITIALIZER; +extern char **environ; + +static pthread_once_t device_list_once = PTHREAD_ONCE_INIT; static int num_devices; static struct ibv_device **device_list; +static void count_devices(void) +{ + num_devices = ibverbs_init(&device_list); +} + struct ibv_device **__ibv_get_device_list(int *num) { - struct ibv_device **l = 0; + struct ibv_device **l; int i; if (num) *num = 0; - pthread_mutex_lock(&device_list_lock); - - if (!num_devices) - num_devices = ibverbs_init(&device_list); + pthread_once(&device_list_once, count_devices); if (num_devices < 0) { errno = -num_devices; - goto out; + return NULL; } l = calloc(num_devices + 1, sizeof (struct ibv_device *)); if (!l) { errno = ENOMEM; - goto out; + return NULL; } for (i = 0; i < num_devices; ++i) @@ -82,8 +88,6 @@ if (num) *num = num_devices; -out: - pthread_mutex_unlock(&device_list_lock); return l; } default_symver(__ibv_get_device_list, ibv_get_device_list); @@ -122,11 +126,455 @@ } default_symver(__ibv_get_device_guid, 
ibv_get_device_guid); +static int __ibv_exp_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, + int attr_mask) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +static int __ibv_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *attr) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +static int __ibv_exp_post_task(struct ibv_context *context, + struct ibv_exp_task *task_list, + struct ibv_exp_task **bad_task) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +int __ibv_exp_prefetch_mr(struct ibv_mr *mr, struct ibv_exp_prefetch_attr *attr) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +static int __ibv_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +static int __ibv_exp_arm_dct(struct ibv_exp_dct *dct, + struct ibv_exp_arm_attr *attr) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + return ENOSYS; +} + +static int __ibv_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t exp_attr_mask) +{ + struct verbs_context_exp *vctx; + int ret; + + vctx = verbs_get_exp_ctx_op(qp->context, drv_exp_modify_qp); + if (!vctx) { + errno = ENOSYS; + return errno; + } + ret = vctx->drv_exp_modify_qp(qp, attr, exp_attr_mask); + if (ret) + return ret; + + if (exp_attr_mask & IBV_EXP_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +struct ibv_mr *__ibv_exp_create_mr(struct ibv_exp_create_mr_in *in) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + errno = ENOSYS; + return NULL; +} + +struct ibv_exp_mkey_list_container *__ibv_exp_alloc_mkey_list_memory(struct ibv_exp_mkey_list_container_attr *attr) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + errno = ENOSYS; + return NULL; +} + +int 
__ibv_exp_dealloc_mkey_list_memory(struct ibv_exp_mkey_list_container *mem) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + errno = ENOSYS; + return errno; +} + +int __ibv_exp_query_mkey(struct ibv_mr *mr, + struct ibv_exp_mkey_attr *query_mkey_in) +{ + fprintf(stderr, PFX "Fatal: device doesn't support function.\n"); + errno = ENOSYS; + return errno; +} + +int __ibv_exp_rereg_mr(struct ibv_mr *mr, int flags, + struct ibv_pd *pd, void *addr, + size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr) +{ + int dofork_onfail = 0; + int err; + struct verbs_context_exp *vctx; + void *old_addr; + size_t old_len; + struct ibv_exp_rereg_out out; + + if (attr->comp_mask & ~(IBV_EXP_REREG_MR_ATTR_RESERVED - 1)) + return errno = EINVAL; + + if (flags & ~IBV_EXP_REREG_MR_FLAGS_SUPPORTED) + return errno = EINVAL; + + if ((flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) && + (0 >= length)) + return errno = EINVAL; + + if (!(flags & IBV_EXP_REREG_MR_CHANGE_ACCESS)) + access = 0; + + if ((access & IBV_EXP_ACCESS_ALLOCATE_MR) && + (!(flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) || + (addr != NULL))) + return errno = EINVAL; + + if ((!(access & IBV_EXP_ACCESS_ALLOCATE_MR)) && + (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) && + (addr == NULL)) + return errno = EINVAL; + + vctx = verbs_get_exp_ctx_op(mr->context, drv_exp_rereg_mr); + if (!vctx) + return errno = ENOSYS; + + /* If address will be allocated internally fork support is handled by the provider */ + if (!(access & IBV_EXP_ACCESS_ALLOCATE_MR) && + flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) { + err = ibv_dontfork_range(addr, length); + if (err) + return err; + dofork_onfail = 1; + } + + old_addr = mr->addr; + old_len = mr->length; + memset(&out, 0, sizeof(out)); + if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) + out.need_dofork = 1; + + err = vctx->drv_exp_rereg_mr(mr, flags, pd, addr, length, access, attr, &out); + if (!err) { + if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) { + if 
(out.need_dofork) + ibv_dofork_range(old_addr, old_len); + if (access & IBV_EXP_ACCESS_ALLOCATE_MR) { + ; + } else { + /* In case that internal allocator was used + addr already set internally + */ + mr->addr = addr; + mr->length = length; + } + } + if (flags & IBV_EXP_REREG_MR_CHANGE_PD) + mr->pd = pd; + } else if (dofork_onfail) { + ibv_dofork_range(addr, length); + } + + return err; +} + +static int __ibv_exp_query_gid_attr(struct ibv_context *context, + uint8_t port_num, + unsigned int index, + struct ibv_exp_gid_attr *attr) +{ + char *dir_path; + char name[32]; + char buff[41]; + DIR *dir; + + if (attr->comp_mask & ~(IBV_EXP_QUERY_GID_ATTR_RESERVED - 1)) + return ENOTSUP; + + if (attr->comp_mask & IBV_EXP_QUERY_GID_ATTR_TYPE) { + snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", + port_num, index); + if (ibv_read_sysfs_file(context->device->ibdev_path, name, buff, + sizeof(buff)) <= 0) { + if (asprintf(&dir_path, "%s/%s", + context->device->ibdev_path, + "ports/1/gid_attrs/") < 0) + return ENOMEM; + dir = opendir(dir_path); + free(dir_path); + if (!dir) { + if (errno == ENOENT) + /* Assuming that if gid_attrs doesn't + * exist, we have an old kernel and all + * GIDs are IB/RoCE v1 + */ + attr->type = IBV_EXP_IB_ROCE_V1_GID_TYPE; + else + return errno; + } else { + closedir(dir); + return EINVAL; + } + } else { + if (!strcmp(buff, "IB/RoCE V1")) + attr->type = IBV_EXP_IB_ROCE_V1_GID_TYPE; + else if (!strcmp(buff, "RoCE V2")) + attr->type = IBV_EXP_ROCE_V2_GID_TYPE; + else if (!strcmp(buff, "RoCE V1.5")) + attr->type = IBV_EXP_ROCE_V1_5_GID_TYPE; + else + return EINVAL; + } + } + + if (attr->comp_mask & IBV_EXP_QUERY_GID_ATTR_GID) { + if (ibv_query_gid(context, port_num, index, &attr->gid)) + return ENOENT; + } + + return 0; +} + +static void remove_env(struct verbs_environment *env, + struct verbs_env_item *cur, + struct verbs_env_item *prev) +{ + free(cur->name); + free(cur->value); + if (prev) + prev->next = cur->next; + else + env->head = 
cur->next; + free(cur); +} + +static int vsetenv(struct verbs_environment *env, const char *name, + const char *value, int overwrite) +{ + struct verbs_env_item *cur; + struct verbs_env_item *prev; + int err = 0; + int found = 0; + + errno = ENOMEM; + if (strlen(value) >= VERBS_MAX_ENV_VAL) { + fprintf(stderr, "Note: Max supported value for env var is %d\n", + VERBS_MAX_ENV_VAL - 1); + return -1; + } + pthread_mutex_lock(&env->mtx); + for (prev = NULL, cur = env->head; cur; prev = cur, cur = cur->next) { + if (!strcmp(cur->name, name)) { + found = 1; + if (!strcmp(cur->value, value)) + break; + + if (overwrite) { + free(cur->value); + cur->value = strdup(value); + if (!cur->value) { + remove_env(env, cur, prev); + err = -1; + } + } else { + errno = EEXIST; + err = -1; + } + break; + } + } + if (!found) { + cur = calloc(1, sizeof(*cur)); + if (!cur) + goto out; + + cur->name = strdup(name); + if (!cur->name) + goto out; + + cur->value = strdup(value); + if (!cur->value) + goto out; + + cur->next = env->head; + env->head = cur; + } + + pthread_mutex_unlock(&env->mtx); + return err; + +out: + if (cur) { + free(cur->name); + free(cur); + } + + pthread_mutex_unlock(&env->mtx); + return -1; +} + +static void clone_env(struct verbs_environment *venv) +{ + char *tmp; + int i; + char *p; + int err; + + /* The caller is responsible to ensure that no setenv or unsetenv + * is called in paraller to this function. Failing to do this might + * cause a segmentation fault. Sadly, POSIX starndard does not + * provide any thread safe way for doing this. 
+ */ + for (i = 0; environ[i]; i++) { + tmp = strdup(environ[i]); + if (!tmp) { + fprintf(stderr, "strdup failed\n"); + continue; + } + p = strpbrk(tmp, "="); + if (!p) { + fprintf(stderr, "could not find = char\n"); + free(tmp); + continue; + } + *p = '\0'; + err = vsetenv(venv, tmp, p + 1, 0); + if (err) { + fprintf(stderr, "vsetenv %s failed\n", tmp); + free(tmp); + continue; + } + + free(tmp); + } +} + +static int check_space_copy(char *value, const char *s, size_t n) +{ + int len; + + len = strlen(s); + if (len >= n) + return len + 1; + strcpy(value, s); + return 0; +} + +static int vgetenv(struct verbs_environment *env, const char *name, + char *value, size_t n) +{ + struct verbs_env_item *cur; + int ret = -1; + + pthread_mutex_lock(&env->mtx); + for (cur = env->head; cur; cur = cur->next) { + if (!strcmp(cur->name, name)) { + ret = check_space_copy(value, cur->value, n); + break; + } + } + pthread_mutex_unlock(&env->mtx); + return ret; +} + +int ibv_exp_cmd_getenv(struct ibv_context *context, const char *name, char *value, size_t n) +{ + struct verbs_context_exp *vctx; + char *res; + + vctx = verbs_get_exp_ctx(context); + if (vctx && vctx->venv) + return vgetenv(vctx->venv, name, value, n); + + res = getenv(name); + if (res) + return check_space_copy(value, res, n); + + return -1; +} + +static pthread_mutex_t venv_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int __ibv_exp_use_priv_env(struct ibv_context *context) +{ + struct verbs_context_exp *vctx; + int err; + struct verbs_environment *venv; + + vctx = verbs_get_exp_ctx_op(context, lib_exp_use_priv_env); + if (!vctx) { + errno = ENOSYS; + fprintf(stderr, "could not retrieve context\n"); + return -1; + } + pthread_mutex_lock(&venv_mutex); + if (!vctx->venv) { + venv = calloc(1, sizeof(*venv)); + if (!venv) { + errno = ENOMEM; + err = -1; + goto out; + } + venv->head = NULL; + if (pthread_mutex_init(&venv->mtx, NULL)) { + err = -1; + goto out; + } + clone_env(venv); + vctx->venv = venv; + } + 
pthread_mutex_unlock(&venv_mutex); + return 0; + +out: + free(venv); + pthread_mutex_unlock(&venv_mutex); + return err; +} + +static int __ibv_exp_setenv(struct ibv_context *context, + const char *name, + const char *value, + int overwrite) +{ + struct verbs_context_exp *vctx; + struct verbs_environment *venv; + + vctx = verbs_get_exp_ctx_op(context, lib_exp_setenv); + if (vctx) + venv = vctx->venv; + + return vctx && venv ? + vsetenv(venv, name, value, overwrite) : + setenv(name, value, overwrite); +} + struct ibv_context *__ibv_open_device(struct ibv_device *device) { + struct verbs_device *verbs_device = verbs_get_device(device); char *devpath; - int cmd_fd; + int cmd_fd, ret; struct ibv_context *context; + struct verbs_context *context_ex; + struct verbs_context_exp *context_exp; if (asprintf(&devpath, "/dev/%s", device->dev_name) < 0) return NULL; @@ -141,9 +589,96 @@ if (cmd_fd < 0) return NULL; - context = device->ops.alloc_context(device, cmd_fd); - if (!context) - goto err; + if (!verbs_device) { + context = device->ops.alloc_context(device, cmd_fd); + if (!context) + goto err; + } else { + /* Library now allocates the context */ + context_exp = calloc(1, sizeof(*context_ex) + sizeof(*context_exp) + + verbs_device->size_of_context); + if (!context_exp) { + errno = ENOMEM; + goto err; + } + + context_ex = (struct verbs_context *)((void *)context_exp + sizeof(*context_exp)); + context_exp->sz = sizeof(*context_exp); + context_ex->has_comp_mask |= VERBS_CONTEXT_EXP; + context_ex->context.abi_compat = __VERBS_ABI_IS_EXTENDED; + context_ex->sz = sizeof(*context_ex); + context_exp->exp_query_gid_attr = __ibv_exp_query_gid_attr; + + context = &context_ex->context; + ret = verbs_device->init_context(verbs_device, context, cmd_fd); + if (ret) + goto verbs_err; + + /* initialize *all* library ops to either lib calls or + * directly to provider calls. 
+ * context_ex->lib_new_func1 = __verbs_new_func1; + * context_ex->lib_new_func2 = __verbs_new_func2; + */ + + /* initialize *all* library experimental ops to either lib calls or + * directly to provider calls. + * context_exp->lib_new_func1 = __verbs_new_func1; + * context_exp->lib_new_func2 = __verbs_new_func2; + */ + context_exp->lib_exp_create_qp = context_exp->drv_exp_create_qp; + context_exp->lib_exp_query_device = context_exp->drv_exp_query_device; + context_exp->lib_exp_query_port = + context_exp->drv_exp_query_port; + + context_exp->lib_exp_ibv_reg_shared_mr = __ibv_reg_shared_mr; + context_exp->lib_exp_ibv_create_flow = + context_exp->drv_exp_ibv_create_flow; + context_exp->lib_exp_ibv_destroy_flow = + context_exp->drv_exp_ibv_destroy_flow; + context_exp->lib_exp_modify_cq = (context_exp->drv_exp_modify_cq ? + context_exp->drv_exp_modify_cq : + __ibv_exp_modify_cq); + context_exp->lib_exp_query_device = (context_exp->drv_exp_query_device ? + context_exp->drv_exp_query_device : + __ibv_exp_query_device); + + context_exp->lib_exp_modify_qp = __ibv_exp_modify_qp; + + context_exp->lib_exp_post_task = (context_exp->drv_exp_post_task ? + context_exp->drv_exp_post_task : + __ibv_exp_post_task); + context_exp->lib_exp_reg_mr = __ibv_exp_reg_mr; + context_exp->lib_exp_bind_mw = (context_exp->drv_exp_bind_mw ? + context_exp->drv_exp_bind_mw : + __ibv_exp_bind_mw); + context_exp->lib_exp_arm_dct = (context_exp->drv_exp_arm_dct ? + context_exp->drv_exp_arm_dct : + __ibv_exp_arm_dct); + context_exp->lib_exp_create_mr = (context_exp->drv_exp_create_mr ? + context_exp->drv_exp_create_mr : + __ibv_exp_create_mr); + context_exp->lib_exp_query_mkey = (context_exp->drv_exp_query_mkey ? + context_exp->drv_exp_query_mkey : + __ibv_exp_query_mkey); + context_exp->lib_exp_dealloc_mkey_list_memory = (context_exp->drv_exp_dealloc_mkey_list_memory ? 
+ context_exp->drv_exp_dealloc_mkey_list_memory : + __ibv_exp_dealloc_mkey_list_memory); + context_exp->lib_exp_alloc_mkey_list_memory = (context_exp->drv_exp_alloc_mkey_list_memory ? + context_exp->drv_exp_alloc_mkey_list_memory : + __ibv_exp_alloc_mkey_list_memory); + context_exp->lib_exp_prefetch_mr = + (context_exp->drv_exp_prefetch_mr ? + context_exp->drv_exp_prefetch_mr : + __ibv_exp_prefetch_mr); + + context_exp->exp_rereg_mr = __ibv_exp_rereg_mr; + context_exp->lib_exp_use_priv_env = __ibv_exp_use_priv_env; + context_exp->lib_exp_setenv = __ibv_exp_setenv; + ret = ibv_exp_use_priv_env(context); + if (ret) + fprintf(stderr, PFX "Warning: ibv_exp_use_priv_env failed, errno: %d\n", errno); + + } context->device = device; context->cmd_fd = cmd_fd; @@ -151,31 +686,54 @@ return context; +verbs_err: + free(context_exp); err: close(cmd_fd); - return NULL; } default_symver(__ibv_open_device, ibv_open_device); +static void clear_env(struct verbs_environment *venv) +{ + struct verbs_env_item *cur; + struct verbs_env_item *tmp; + + if (!venv) + return; + + pthread_mutex_lock(&venv->mtx); + for (cur = venv->head; cur;) { + free(cur->name); + free(cur->value); + tmp = cur->next; + free(cur); + cur = tmp; + } + pthread_mutex_unlock(&venv->mtx); + free(venv); +} + int __ibv_close_device(struct ibv_context *context) { int async_fd = context->async_fd; int cmd_fd = context->cmd_fd; - int cq_fd = -1; - if (abi_ver <= 2) { - struct ibv_abi_compat_v2 *t = context->abi_compat; - cq_fd = t->channel.fd; - free(context->abi_compat); - } + struct verbs_context_exp *context_exp; - context->device->ops.free_context(context); + context_exp = verbs_get_exp_ctx(context); + + if (context_exp) { + struct verbs_device *verbs_device = verbs_get_device(context->device); + verbs_device->uninit_context(verbs_device, context); + clear_env(context_exp->venv); + free(context_exp); + } else { + context->device->ops.free_context(context); + } close(async_fd); close(cmd_fd); - if (abi_ver <= 2) - 
close(cq_fd); return 0; } @@ -185,20 +743,31 @@ struct ibv_async_event *event) { struct ibv_kern_async_event ev; + struct verbs_context_exp *vctx; + struct ibv_srq_legacy *ibv_srq_legacy = NULL; + struct ibv_qp *qp; + enum ibv_event_rsc_type rsc_type; if (read(context->async_fd, &ev, sizeof ev) != sizeof ev) return -1; event->event_type = ev.event_type; + rsc_type = ev.rsc_type; - if (event->event_type & IBV_XRC_QP_EVENT_FLAG) { - event->element.xrc_qp_num = ev.element; - } else + switch (rsc_type) { + case IBV_EVENT_RSC_CQ: + event->element.cq = (void *)(uintptr_t)ev.element; switch (event->event_type) { case IBV_EVENT_CQ_ERR: - event->element.cq = (void *) (uintptr_t) ev.element; break; + default: + fprintf(stderr, "Invalid CQ event (%d)\n", event->event_type); + } + break; + case IBV_EVENT_RSC_QP: + event->element.qp = (void *)(uintptr_t)ev.element; + switch (event->event_type) { case IBV_EVENT_QP_FATAL: case IBV_EVENT_QP_REQ_ERR: case IBV_EVENT_QP_ACCESS_ERR: @@ -207,17 +776,70 @@ case IBV_EVENT_PATH_MIG: case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_QP_LAST_WQE_REACHED: - event->element.qp = (void *) (uintptr_t) ev.element; + qp = ibv_find_xrc_qp(event->element.qp->qp_num); + if (qp) { + /* This is XRC reciever QP created by the legacy API */ + event->event_type |= IBV_XRC_QP_EVENT_FLAG; + event->element.qp = NULL; + event->element.xrc_qp_num = qp->qp_num; + } break; + default: + fprintf(stderr, "Invalid QP event (%d)\n", event->event_type); + } + break; + case IBV_EVENT_RSC_DCT: + event->element.dct = (void *)(uintptr_t)ev.element; + switch (event->event_type) { + case IBV_EXP_EVENT_DCT_KEY_VIOLATION: + case IBV_EXP_EVENT_DCT_ACCESS_ERR: + case IBV_EXP_EVENT_DCT_REQ_ERR: + break; + default: + fprintf(stderr, "Invalid DCT event (%d)\n", event->event_type); + } + break; + + case IBV_EVENT_RSC_SRQ: + vctx = verbs_get_exp_ctx_op(context, drv_exp_get_legacy_xrc); + if (vctx) + /* ev.elemant is ibv_srq comes from the kernel, in case there is leagcy one + * it should 
be returened instead. + */ + ibv_srq_legacy = vctx->drv_exp_get_legacy_xrc((void *) (uintptr_t) ev.element); + + event->element.srq = (ibv_srq_legacy) ? (void *)ibv_srq_legacy : + (void *) (uintptr_t) ev.element; + switch (event->event_type) { case IBV_EVENT_SRQ_ERR: case IBV_EVENT_SRQ_LIMIT_REACHED: - event->element.srq = (void *) (uintptr_t) ev.element; break; default: - event->element.port_num = ev.element; + fprintf(stderr, "Invalid SRQ event (%d)\n", event->event_type); + } + + break; + + case IBV_EVENT_RSC_DEVICE: + event->element.port_num = ev.element; + switch (event->event_type) { + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_GID_CHANGE: break; + default: + fprintf(stderr, "Invalid Device event (%d)\n", event->event_type); } + break; + default: + fprintf(stderr, "Invalid resource type (%d)\n", rsc_type); + } if (context->ops.async_event) context->ops.async_event(event); @@ -228,6 +850,14 @@ void __ibv_ack_async_event(struct ibv_async_event *event) { + int is_legacy_xrc = 0; + struct ibv_exp_dct *dct; + + if (event->event_type & IBV_XRC_QP_EVENT_FLAG) { + event->event_type ^= IBV_XRC_QP_EVENT_FLAG; + is_legacy_xrc = 1; + } + switch (event->event_type) { case IBV_EVENT_CQ_ERR: { @@ -252,6 +882,19 @@ { struct ibv_qp *qp = event->element.qp; + if (is_legacy_xrc) { + /* Looking for ibv_qp for this XRC reciever QPN */ + qp = ibv_find_xrc_qp(event->element.xrc_qp_num); + /* Even if found a qp making sure that it matches, would like + * to prevent rare case while pointer value was matched to qp number. 
+ */ + if (!qp || qp->qp_num != event->element.xrc_qp_num) { + fprintf(stderr, PFX "Warning: ibv_ack_async_event, XRC qpn=%u wasn't found\n", + event->element.xrc_qp_num); + return; + } + } + pthread_mutex_lock(&qp->mutex); ++qp->events_completed; pthread_cond_signal(&qp->cond); @@ -265,6 +908,13 @@ { struct ibv_srq *srq = event->element.srq; + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) { + struct ibv_srq_legacy *ibv_srq_legacy = + (struct ibv_srq_legacy *) srq; + srq = ibv_srq_legacy->ibv_srq; + } + + /* We should use here the internal mutx/cond even in legacy mode */ pthread_mutex_lock(&srq->mutex); ++srq->events_completed; pthread_cond_signal(&srq->cond); @@ -273,6 +923,16 @@ return; } + case IBV_EXP_EVENT_DCT_KEY_VIOLATION: + case IBV_EXP_EVENT_DCT_ACCESS_ERR: + case IBV_EXP_EVENT_DCT_REQ_ERR: + dct = event->element.dct; + pthread_mutex_lock(&dct->mutex); + dct->events_completed++; + pthread_cond_signal(&dct->cond); + pthread_mutex_unlock(&dct->mutex); + break; + default: return; } Index: contrib/ofed/libibverbs/src/enum_strs.c =================================================================== --- contrib/ofed/libibverbs/src/enum_strs.c +++ contrib/ofed/libibverbs/src/enum_strs.c @@ -40,11 +40,21 @@ [IBV_NODE_ROUTER] = "InfiniBand router", [IBV_NODE_RNIC] = "iWARP NIC" }; + static const char *const exp_node_type_str[] = { + [IBV_EXP_NODE_MIC - IBV_EXP_NODE_TYPE_START] = "MIC" + }; + + if (node_type < IBV_EXP_NODE_TYPE_START) { + if (node_type < IBV_NODE_CA || node_type > IBV_NODE_RNIC) + return "unknown"; - if (node_type < IBV_NODE_CA || node_type > IBV_NODE_RNIC) + return node_type_str[node_type]; + } + if (node_type > IBV_EXP_NODE_MIC) return "unknown"; - return node_type_str[node_type]; + return exp_node_type_str[node_type - IBV_EXP_NODE_TYPE_START]; + } const char *ibv_port_state_str(enum ibv_port_state port_state) @@ -85,9 +95,10 @@ [IBV_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", [IBV_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", 
[IBV_EVENT_CLIENT_REREGISTER] = "client reregistration", + [IBV_EVENT_GID_CHANGE] = "GID table change" }; - if (event < IBV_EVENT_CQ_ERR || event > IBV_EVENT_CLIENT_REREGISTER) + if (event < IBV_EVENT_CQ_ERR || event > IBV_EVENT_GID_CHANGE) return "unknown"; return event_type_str[event]; Index: contrib/ofed/libibverbs/src/ibverbs.h =================================================================== --- contrib/ofed/libibverbs/src/ibverbs.h +++ contrib/ofed/libibverbs/src/ibverbs.h @@ -37,6 +37,7 @@ #include #include +#include #ifdef HAVE_VALGRIND_MEMCHECK_H @@ -80,6 +81,9 @@ extern HIDDEN int abi_ver; HIDDEN int ibverbs_init(struct ibv_device ***list); +HIDDEN struct ibv_mr *__ibv_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in); +HIDDEN struct ibv_mr *__ibv_exp_reg_mr(struct ibv_exp_reg_mr_in *in); +HIDDEN struct ibv_qp *ibv_find_xrc_qp(uint32_t qpn); #define IBV_INIT_CMD(cmd, size, opcode) \ do { \ @@ -102,4 +106,52 @@ (cmd)->response = (uintptr_t) (out); \ } while (0) +#define IBV_INIT_CMD_RESP_EX_V(cmd, cmd_size, size, opcode, out, resp_size,\ + outsize) \ + do { \ + size_t c_size = cmd_size - sizeof(struct ex_hdr); \ + if (abi_ver > 2) \ + (cmd)->hdr.command = IB_USER_VERBS_CMD_##opcode; \ + else \ + (cmd)->hdr.command = \ + IB_USER_VERBS_CMD_##opcode##_V2; \ + (cmd)->hdr.in_words = ((c_size) / 8); \ + (cmd)->hdr.out_words = ((resp_size) / 8); \ + (cmd)->hdr.provider_in_words = (((size) - (cmd_size))/8);\ + (cmd)->hdr.provider_out_words = \ + (((outsize) - (resp_size)) / 8); \ + (cmd)->hdr.response = (uintptr_t) (out); \ + (cmd)->hdr.reserved = 0; \ + } while (0) + +#define IBV_INIT_CMD_RESP_EX_VCMD(cmd, cmd_size, size, opcode, out, outsize) \ + IBV_INIT_CMD_RESP_EX_V(cmd, cmd_size, size, opcode, out, \ + sizeof(*(out)), outsize) + +#define IBV_INIT_CMD_RESP_EX(cmd, size, opcode, out, outsize) \ + IBV_INIT_CMD_RESP_EX_V(cmd, sizeof(*(cmd)), size, opcode, out, \ + sizeof(*(out)), outsize) + +#define IBV_INIT_CMD_EX(cmd, size, opcode) \ + 
IBV_INIT_CMD_RESP_EX_V(cmd, sizeof(*(cmd)), size, opcode, NULL, 0, 0) + +#define IBV_INIT_CMD_RESP_EXP(opcode, cmd, cmd_size, drv_size, out, osize, \ + drv_osize) \ + do { \ + size_t c_size = cmd_size - sizeof(struct ex_hdr); \ + (cmd)->hdr.command = IB_USER_VERBS_EXP_CMD_##opcode + \ + IB_USER_VERBS_EXP_CMD_FIRST; \ + (cmd)->hdr.in_words = (c_size / 8); \ + (cmd)->hdr.out_words = (osize / 8); \ + (cmd)->hdr.provider_in_words = (drv_size / 8); \ + (cmd)->hdr.provider_out_words = (drv_osize / 8); \ + (cmd)->hdr.response = (uintptr_t) (out); \ + (cmd)->hdr.reserved = 0; \ + } while (0) + +#define IBV_INIT_CMD_EXP(opcode, cmd, cmd_size, drv_size) \ + IBV_INIT_CMD_RESP_EXP(opcode, cmd, cmd_size, drv_size, 0, 0, 0) + + +void ibv_set_huge_safe(void); #endif /* IB_VERBS_H */ Index: contrib/ofed/libibverbs/src/init.c =================================================================== --- contrib/ofed/libibverbs/src/init.c +++ contrib/ofed/libibverbs/src/init.c @@ -70,6 +70,7 @@ struct ibv_driver { const char *name; ibv_driver_init_func init_func; + verbs_driver_init_func verbs_init_func; struct ibv_driver *next; }; @@ -160,7 +161,7 @@ int i; snprintf(class_path, sizeof class_path, "%s/class/infiniband_verbs", - ibv_get_sysfs_path()); + ibv_get_sysfs_path()); for (i = 0; i < 256; i++) { if (!sysfs_dev) @@ -171,24 +172,24 @@ } snprintf(sysfs_dev->sysfs_path, sizeof sysfs_dev->sysfs_path, - "%s/uverbs%d", class_path, i); + "%s/uverbs%d", class_path, i); snprintf(sysfs_dev->sysfs_name, sizeof sysfs_dev->sysfs_name, - "uverbs%d", i); + "uverbs%d", i); if (ibv_read_sysfs_file(sysfs_dev->sysfs_path, "ibdev", - sysfs_dev->ibdev_name, - sizeof sysfs_dev->ibdev_name) < 0) + sysfs_dev->ibdev_name, + sizeof sysfs_dev->ibdev_name) < 0) continue; snprintf(sysfs_dev->ibdev_path, sizeof sysfs_dev->ibdev_path, - "%s/class/infiniband/%s", ibv_get_sysfs_path(), - sysfs_dev->ibdev_name); + "%s/class/infiniband/%s", ibv_get_sysfs_path(), + sysfs_dev->ibdev_name); - sysfs_dev->next = 
sysfs_dev_list; + sysfs_dev->next = sysfs_dev_list; sysfs_dev->have_driver = 0; if (ibv_read_sysfs_file(sysfs_dev->sysfs_path, "abi_version", - value, sizeof value) > 0) + value, sizeof value) > 0) sysfs_dev->abi_ver = strtol(value, NULL, 10); else sysfs_dev->abi_ver = 0; @@ -196,17 +197,16 @@ sysfs_dev_list = sysfs_dev; sysfs_dev = NULL; } - - out: +out: if (sysfs_dev) free(sysfs_dev); return ret; - #endif } -void ibv_register_driver(const char *name, ibv_driver_init_func init_func) +static void register_driver(const char *name, ibv_driver_init_func init_func, + verbs_driver_init_func verbs_init_func) { struct ibv_driver *driver; @@ -216,9 +216,10 @@ return; } - driver->name = name; - driver->init_func = init_func; - driver->next = NULL; + driver->name = name; + driver->init_func = init_func; + driver->verbs_init_func = verbs_init_func; + driver->next = NULL; if (tail_driver) tail_driver->next = driver; @@ -227,6 +228,19 @@ tail_driver = driver; } +void ibv_register_driver(const char *name, ibv_driver_init_func init_func) +{ + register_driver(name, init_func, NULL); +} + +/* New registration symbol with same functionality - used by providers to + * validate that library supports verbs extension. 
+ */ +void verbs_register_driver(const char *name, verbs_driver_init_func init_func) +{ + register_driver(name, NULL, init_func); +} + static void load_driver(const char *name) { char *so_name; @@ -310,7 +324,7 @@ field = strsep(&config, "\n\t "); - if (strcmp(field, "driver") == 0) { + if (strcmp(field, "driver") == 0 && config != NULL) { struct ibv_driver_name *driver_name; config += strspn(config, "\t "); @@ -362,7 +376,7 @@ if (asprintf(&path, "%s/%s", IBV_CONFIG_DIR, dent->d_name) < 0) { fprintf(stderr, PFX "Warning: couldn't read config file %s/%s.\n", IBV_CONFIG_DIR, dent->d_name); - return; + goto out; } if (stat(path, &buf)) { @@ -379,18 +393,30 @@ free(path); } +out: closedir(conf_dir); } static struct ibv_device *try_driver(struct ibv_driver *driver, struct ibv_sysfs_dev *sysfs_dev) { + struct verbs_device *vdev; struct ibv_device *dev; char value[8]; - dev = driver->init_func(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); - if (!dev) - return NULL; + if (driver->init_func) { + dev = driver->init_func(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); + if (!dev) + return NULL; + } else { + vdev = driver->verbs_init_func(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); + if (!vdev) + return NULL; + + dev = &vdev->device; + dev->ops.alloc_context = NULL; + dev->ops.free_context = NULL; + } if (ibv_read_sysfs_file(sysfs_dev->ibdev_path, "node_type", value, sizeof value) < 0) { fprintf(stderr, PFX "Warning: no node_type attr under %s.\n", @@ -398,10 +424,14 @@ dev->node_type = IBV_NODE_UNKNOWN; } else { dev->node_type = strtol(value, NULL, 10); - if (dev->node_type < IBV_NODE_CA || dev->node_type > IBV_NODE_RNIC) - dev->node_type = IBV_NODE_UNKNOWN; + if (dev->node_type < IBV_EXP_NODE_TYPE_START) { + if (dev->node_type < IBV_NODE_CA || dev->node_type > IBV_NODE_RNIC) + dev->node_type = IBV_NODE_UNKNOWN; + } else { + if (dev->node_type > IBV_EXP_NODE_MIC) + dev->node_type = IBV_NODE_UNKNOWN; + } } -out: switch (dev->node_type) { case IBV_NODE_CA: @@ -412,6 +442,9 @@ case 
IBV_NODE_RNIC: dev->transport_type = IBV_TRANSPORT_IWARP; break; + case IBV_EXP_NODE_MIC: + dev->transport_type = IBV_EXP_TRANSPORT_SCIF; + break; default: dev->transport_type = IBV_TRANSPORT_UNKNOWN; break; Index: contrib/ofed/libibverbs/src/libibverbs.map =================================================================== --- contrib/ofed/libibverbs/src/libibverbs.map +++ contrib/ofed/libibverbs/src/libibverbs.map @@ -64,6 +64,8 @@ ibv_cmd_destroy_ah; ibv_cmd_attach_mcast; ibv_cmd_detach_mcast; + ibv_cmd_create_flow; + ibv_cmd_destroy_flow; ibv_copy_qp_attr_from_kern; ibv_copy_path_rec_from_kern; ibv_copy_path_rec_to_kern; @@ -83,7 +85,6 @@ ibv_get_device_guid; ibv_open_device; ibv_close_device; - ibv_resolve_eth_gid; ibv_init_ah_from_wc; ibv_create_ah_from_wc; @@ -92,25 +93,50 @@ ibv_dontfork_range; ibv_dofork_range; ibv_register_driver; - ibv_create_xrc_srq; - ibv_cmd_create_xrc_srq; + verbs_register_driver; + + ibv_node_type_str; + ibv_port_state_str; + ibv_event_type_str; + ibv_wc_status_str; + + ibv_cmd_alloc_mw; + ibv_cmd_dealloc_mw; + + ibv_rate_to_mbps; + mbps_to_ibv_rate; + ibv_cmd_open_xrcd; + ibv_cmd_close_xrcd; + ibv_cmd_create_srq_ex; + ibv_cmd_open_qp; ibv_open_xrc_domain; - ibv_cmd_open_xrc_domain; + ibv_create_xrc_srq; ibv_close_xrc_domain; - ibv_cmd_close_xrc_domain; ibv_create_xrc_rcv_qp; - ibv_cmd_create_xrc_rcv_qp; ibv_modify_xrc_rcv_qp; - ibv_cmd_modify_xrc_rcv_qp; - ibv_query_xrc_rcv_qp; - ibv_cmd_query_xrc_rcv_qp; ibv_reg_xrc_rcv_qp; - ibv_cmd_reg_xrc_rcv_qp; ibv_unreg_xrc_rcv_qp; - ibv_cmd_unreg_xrc_rcv_qp; - - ibv_node_type_str; - ibv_port_state_str; - ibv_event_type_str; - ibv_wc_status_str; + ibv_query_xrc_rcv_qp; + ibv_exp_cmd_create_qp; + ibv_exp_cmd_query_device; + ibv_exp_cmd_create_dct; + ibv_exp_cmd_destroy_dct; + ibv_exp_cmd_query_dct; + ibv_exp_cmd_arm_dct; + ibv_exp_cmd_modify_cq; + ibv_exp_cmd_modify_qp; + ibv_exp_cmd_create_cq; + ibv_exp_cmd_create_mr; + ibv_exp_cmd_query_mkey; + ibv_cmd_exp_reg_mr; + ibv_cmd_exp_prefetch_mr; 
+ ibv_exp_cmd_rereg_mr; + ibv_exp_cmd_getenv; + ibv_exp_cmd_create_flow; + ibv_exp_cmd_destroy_flow; + ibv_exp_cmd_create_wq; + ibv_exp_cmd_modify_wq; + ibv_exp_cmd_destroy_wq; + ibv_exp_cmd_create_rwq_ind_table; + ibv_exp_cmd_destroy_rwq_ind_table; } IBVERBS_1.0; Index: contrib/ofed/libibverbs/src/memory.c =================================================================== --- contrib/ofed/libibverbs/src/memory.c +++ contrib/ofed/libibverbs/src/memory.c @@ -38,26 +38,18 @@ #include #include #include +#include #include #include +#include +#include +#include +#include #include "ibverbs.h" -/* - * Most distro's headers don't have these yet. - */ -#ifdef __linux__ -#ifndef MADV_DONTFORK -#define MADV_DONTFORK 10 -#endif - -#ifndef MADV_DOFORK -#define MADV_DOFORK 11 -#endif -#else -#define MADV_DONTFORK INHERIT_NONE -#define MADV_DOFORK INHERIT_SHARE -#endif +#define MADV_DONTFORK INHERIT_NONE +#define MADV_DOFORK INHERIT_SHARE struct ibv_mem_node { enum { @@ -73,14 +65,72 @@ static struct ibv_mem_node *mm_root; static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; static int page_size; +static int huge_page_enabled; static int too_late; +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r"); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%" SCNxPTR "-%"SCNxPTR, &range_start, &range_end); + + if (n < 2) + continue; + + if 
((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + + fclose(file); + +out: + return ret; +} + int ibv_fork_init(void) { -#ifdef __linux__ - void *tmp; + void *tmp, *tmp_aligned; int ret; -#endif + unsigned long size; if (mm_root) return 0; @@ -92,18 +142,29 @@ if (page_size < 0) return errno; -#ifdef __linux__ if (posix_memalign(&tmp, page_size, page_size)) return ENOMEM; - ret = madvise(tmp, page_size, MADV_DONTFORK) || - madvise(tmp, page_size, MADV_DOFORK); + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + else + huge_page_enabled = 0; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *)((uintptr_t) tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); free(tmp); if (ret) return ENOSYS; -#endif mm_root = malloc(sizeof *mm_root); if (!mm_root) @@ -200,31 +261,6 @@ node->parent = tmp; } -static int verify(struct ibv_mem_node *node) -{ - int hl, hr; - - if (!node) - return 1; - - hl = verify(node->left); - hr = verify(node->left); - - if (!hl || !hr) - return 0; - if (hl != hr) - return 0; - - if (node->color == IBV_RED) { - if (node->left && node->left->color != IBV_BLACK) - return 0; - if (node->right && node->right->color != IBV_BLACK) - return 0; - return hl; - } - - return hl + 1; -} static void __mm_add_rebalance(struct ibv_mem_node *node) { @@ -538,13 +574,19 @@ int inc; int rolling_back = 0; int ret = 0; + unsigned long range_page_size; if (!size) return 0; - start = (uintptr_t) base & ~(page_size - 1); - end = ((uintptr_t) (base + size + page_size - 1) & - ~(page_size - 1)) - 1; + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; 
pthread_mutex_lock(&mm_mutex); again: @@ -578,10 +620,10 @@ * and that may lead to a spurious failure. */ if (start > node->start) - ret = minherit((void *) start, node->end - start + 1, + ret = madvise((void *) start, node->end - start + 1, advice); else - ret = minherit((void *) node->start, + ret = madvise((void *) node->start, node->end - node->start + 1, advice); if (ret) { Index: contrib/ofed/libibverbs/src/sysfs.c =================================================================== --- contrib/ofed/libibverbs/src/sysfs.c +++ contrib/ofed/libibverbs/src/sysfs.c @@ -80,21 +80,22 @@ int ibv_read_sysfs_file(const char *dir, const char *file, char *buf, size_t size) { - char *path, *s; - int fd; + char *path; + char *ptr; size_t len; if (asprintf(&path, "%s/%s", dir, file) < 0) return -1; - for (s = &path[0]; *s != '\0'; s++) - if (*s == '/') - *s = '.'; - - len = size; - if (sysctlbyname(&path[1], buf, &len, NULL, 0) == -1) + for (ptr = path; *ptr != '\0'; ptr++) { + if (*ptr == '/') + *ptr = '.'; + } + len = size; + if (sysctlbyname(path + 1, buf, &len, NULL, 0) == -1) { + free(path); return -1; - + } free(path); if (len > 0 && buf[len - 1] == '\n') Index: contrib/ofed/libibverbs/src/verbs.c =================================================================== --- contrib/ofed/libibverbs/src/verbs.c +++ contrib/ofed/libibverbs/src/verbs.c @@ -36,13 +36,19 @@ #endif /* HAVE_CONFIG_H */ #include -#include #include #include #include #include +#include +#include +#include #include "ibverbs.h" +#include "infiniband/verbs_exp.h" +#ifndef NRESOLVE_NEIGH +#include "neigh.h" +#endif int ibv_rate_to_mult(enum ibv_rate rate) { @@ -76,6 +82,54 @@ } } +int ibv_rate_to_mbps(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 2500; + case IBV_RATE_5_GBPS: return 5000; + case IBV_RATE_10_GBPS: return 10000; + case IBV_RATE_20_GBPS: return 20000; + case IBV_RATE_30_GBPS: return 30000; + case IBV_RATE_40_GBPS: return 40000; + case IBV_RATE_60_GBPS: return 
60000; + case IBV_RATE_80_GBPS: return 80000; + case IBV_RATE_120_GBPS: return 120000; + case IBV_RATE_14_GBPS: return 14062; + case IBV_RATE_56_GBPS: return 56250; + case IBV_RATE_112_GBPS: return 112500; + case IBV_RATE_168_GBPS: return 168750; + case IBV_RATE_25_GBPS: return 25781; + case IBV_RATE_100_GBPS: return 103125; + case IBV_RATE_200_GBPS: return 206250; + case IBV_RATE_300_GBPS: return 309375; + default: return -1; + } +} + +enum ibv_rate mbps_to_ibv_rate(int mbps) +{ + switch (mbps) { + case 2500: return IBV_RATE_2_5_GBPS; + case 5000: return IBV_RATE_5_GBPS; + case 10000: return IBV_RATE_10_GBPS; + case 20000: return IBV_RATE_20_GBPS; + case 30000: return IBV_RATE_30_GBPS; + case 40000: return IBV_RATE_40_GBPS; + case 60000: return IBV_RATE_60_GBPS; + case 80000: return IBV_RATE_80_GBPS; + case 120000: return IBV_RATE_120_GBPS; + case 14062: return IBV_RATE_14_GBPS; + case 56250: return IBV_RATE_56_GBPS; + case 112500: return IBV_RATE_112_GBPS; + case 168750: return IBV_RATE_168_GBPS; + case 25781: return IBV_RATE_25_GBPS; + case 103125: return IBV_RATE_100_GBPS; + case 206250: return IBV_RATE_200_GBPS; + case 309375: return IBV_RATE_300_GBPS; + default: return IBV_RATE_MAX; + } +} + int __ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { @@ -154,36 +208,120 @@ } default_symver(__ibv_dealloc_pd, ibv_dealloc_pd); -struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, - size_t length, int access) + +struct ibv_mr *__ibv_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in) +{ + struct verbs_context_exp *ctx = verbs_get_exp_ctx(in->pd->context); + struct ibv_mr *mr; + + if (!ctx->drv_exp_ibv_reg_shared_mr) { + errno = ENOSYS; + return NULL; + } + + mr = ctx->drv_exp_ibv_reg_shared_mr(in); + if (mr) { + if (ibv_dontfork_range(mr->addr, mr->length)) { + /* dereg_mr without its internal dofork */ + mr->context->ops.dereg_mr(mr); + return NULL; + } + } + + return mr; +} + +struct ibv_mr *__ibv_common_reg_mr(struct 
ibv_exp_reg_mr_in *in, + struct verbs_context_exp *context_exp) { struct ibv_mr *mr; + int is_contig; + int is_odp; - if (ibv_dontfork_range(addr, length)) + if ((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) && in->addr != NULL) return NULL; - mr = pd->context->ops.reg_mr(pd, addr, length, access); + is_contig = !!((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) || + ((in->comp_mask & IBV_EXP_REG_MR_CREATE_FLAGS) && + (in->create_flags & IBV_EXP_REG_MR_CREATE_CONTIG))); + + is_odp = !!(in->exp_access & IBV_EXP_ACCESS_ON_DEMAND); + /* fork support for contig is handled by the provider, for odp no special code is needed */ + if (!is_odp && !is_contig) { + if (ibv_dontfork_range(in->addr, in->length)) + return NULL; + } + if (context_exp) + mr = context_exp->drv_exp_reg_mr(in); + else + mr = in->pd->context->ops.reg_mr(in->pd, in->addr, in->length, + in->exp_access); if (mr) { - mr->context = pd->context; - mr->pd = pd; - mr->addr = addr; - mr->length = length; - } else - ibv_dofork_range(addr, length); + mr->context = in->pd->context; + mr->pd = in->pd; + if (in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) + ; + /* In case that internal allocator was used + addr already set internally + */ + else if (!(in->exp_access & IBV_EXP_ACCESS_RELAXED)) + mr->addr = in->addr; + if (!(in->exp_access & IBV_EXP_ACCESS_RELAXED)) + mr->length = in->length; + } else if (!is_odp && !is_contig) { + ibv_dofork_range(in->addr, in->length); + } return mr; } + +struct ibv_mr *__ibv_exp_reg_mr(struct ibv_exp_reg_mr_in *in) +{ + struct verbs_context_exp *ctx = verbs_get_exp_ctx(in->pd->context); + + if (!ctx->drv_exp_reg_mr) { + errno = ENOSYS; + return NULL; + } + return __ibv_common_reg_mr(in, ctx); +} + +struct ibv_mr *__ibv_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_exp_reg_mr_in in; + + memset(&in, 0, sizeof(in)); + in.pd = pd; + in.addr = addr; + in.length = length; + in.exp_access = access; + + return __ibv_common_reg_mr(&in, NULL); +} 
default_symver(__ibv_reg_mr, ibv_reg_mr); int __ibv_dereg_mr(struct ibv_mr *mr) { int ret; + struct verbs_context_exp *vctx; + struct ibv_exp_dereg_out out; void *addr = mr->addr; size_t length = mr->length; - ret = mr->context->ops.dereg_mr(mr); - if (!ret) - ibv_dofork_range(addr, length); + memset(&out, 0, sizeof(out)); + out.need_dofork = 1; + + vctx = verbs_get_exp_ctx_op(mr->context, drv_exp_dereg_mr); + if (vctx) + ret = vctx->drv_exp_dereg_mr(mr, &out); + else + ret = mr->context->ops.dereg_mr(mr); + + if (!ret) { + if (out.need_dofork) + ibv_dofork_range(addr, length); + } return ret; } @@ -366,9 +504,6 @@ srq->context = pd->context; srq->srq_context = srq_init_attr->srq_context; srq->pd = pd; - srq->xrc_domain = NULL; - srq->xrc_cq = NULL; - srq->xrc_srq_num = 0; srq->events_completed = 0; pthread_mutex_init(&srq->mutex, NULL); pthread_cond_init(&srq->cond, NULL); @@ -378,32 +513,6 @@ } default_symver(__ibv_create_srq, ibv_create_srq); -struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *srq_init_attr) -{ - struct ibv_srq *srq; - - if (!pd->context->more_ops) - return NULL; - - srq = pd->context->more_ops->create_xrc_srq(pd, xrc_domain, - xrc_cq, srq_init_attr); - if (srq) { - srq->context = pd->context; - srq->srq_context = srq_init_attr->srq_context; - srq->pd = pd; - srq->xrc_domain = xrc_domain; - srq->xrc_cq = xrc_cq; - srq->events_completed = 0; - pthread_mutex_init(&srq->mutex, NULL); - pthread_cond_init(&srq->cond, NULL); - } - - return srq; -} - int __ibv_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask) @@ -439,8 +548,6 @@ qp->qp_type = qp_init_attr->qp_type; qp->state = IBV_QPS_RESET; qp->events_completed = 0; - qp->xrc_domain = qp_init_attr->qp_type == IBV_QPT_XRC ? 
- qp_init_attr->xrc_domain : NULL; pthread_mutex_init(&qp->mutex, NULL); pthread_cond_init(&qp->cond, NULL); } @@ -488,38 +595,429 @@ } default_symver(__ibv_destroy_qp, ibv_destroy_qp); +#ifndef s6_addr32 +#define s6_addr32 __u6_addr.__u6_addr32 +#endif + +static inline int ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return ((a->s6_addr32[0] | a->s6_addr32[1]) | + (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL || + /* IPv4 encoded multicast addresses */ + (a->s6_addr32[0] == htonl(0xff0e0000) && + ((a->s6_addr32[1] | + (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL)); +} + + +struct peer_address { + void *address; + uint32_t size; +}; + +static inline int create_peer_from_gid(int family, void *raw_gid, + struct peer_address *peer_address) +{ + switch (family) { + case AF_INET: + peer_address->address = raw_gid + 12; + peer_address->size = 4; + break; + case AF_INET6: + peer_address->address = raw_gid; + peer_address->size = 16; + break; + default: + return -1; + } + + return 0; +} + +#define ETHERNET_LL_SIZE 6 +#define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000 struct ibv_ah *__ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { - struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); + struct ibv_ah *ah = NULL; +#ifndef NRESOLVE_NEIGH + int err; + struct ibv_exp_port_attr port_attr; + int dst_family; + int src_family; + int oif; + struct get_neigh_handler neigh_handler; + union ibv_gid sgid; + struct ibv_exp_ah_attr attr_ex; + char ethernet_ll[ETHERNET_LL_SIZE]; + struct verbs_context_exp *vctx = verbs_get_exp_ctx_op(pd->context, + drv_exp_ibv_create_ah); + struct peer_address src; + struct peer_address dst; + + if (!vctx) { +#endif + ah = pd->context->ops.create_ah(pd, attr); +#ifndef NRESOLVE_NEIGH + goto return_ah; + } + + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; + err = ibv_exp_query_port(pd->context, attr->port_num, &port_attr); + + if (err) { + fprintf(stderr, PFX "ibv_create_ah failed to 
query port.\n"); + return NULL; + } + + if ((IBV_LINK_LAYER_ETHERNET == port_attr.link_layer) && + !attr->is_global) { + fprintf(stderr, PFX "GRH is mandatory For RoCE address handle\n"); + return NULL; + } + if (IBV_LINK_LAYER_ETHERNET != port_attr.link_layer) { + ah = pd->context->ops.create_ah(pd, attr); + goto return_ah; + } + + memset(&attr_ex, 0, sizeof(attr_ex)); + + memcpy(&attr_ex, attr, sizeof(*attr)); + memset((void *)&attr_ex + sizeof(*attr), 0, + sizeof(attr_ex) - sizeof(*attr)); + + err = ibv_query_gid(pd->context, attr->port_num, + attr->grh.sgid_index, &sgid); + if (err) { + fprintf(stderr, PFX "ibv_create_ah failed to query sgid.\n"); + return NULL; + } + + if (neigh_init_resources(&neigh_handler, NEIGH_GET_DEFAULT_TIMEOUT_MS)) + return NULL; + + dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? + AF_INET : AF_INET6; + src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ? + AF_INET : AF_INET6; + + if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst)) { + fprintf(stderr, PFX "ibv_create_ah failed to create dst " + "peer\n"); + goto free_resources; + } + if (create_peer_from_gid(src_family, &sgid.raw, &src)) { + fprintf(stderr, PFX "ibv_create_ah failed to create src " + "peer\n"); + goto free_resources; + } + if (neigh_set_dst(&neigh_handler, dst_family, dst.address, + dst.size)) { + fprintf(stderr, PFX "ibv_create_ah failed to create dst " + "addr\n"); + goto free_resources; + } + + if (neigh_set_src(&neigh_handler, src_family, src.address, + src.size)) { + fprintf(stderr, PFX "ibv_create_ah failed to create src " + "addr\n"); + goto free_resources; + } + + oif = neigh_get_oif_from_src(&neigh_handler); + + if (oif > 0) { + neigh_set_oif(&neigh_handler, oif); + } else { + fprintf(stderr, PFX "ibv_create_ah failed to get output IF\n"); + goto free_resources; + } + + + /* blocking call */ + if (process_get_neigh(&neigh_handler)) { + fprintf(stderr, PFX "Neigh resolution process failed\n"); + goto 
free_resources; + } + + attr_ex.vid = neigh_get_vlan_id_from_dev(&neigh_handler); + + if (attr_ex.vid <= 0xfff) { + neigh_set_vlan_id(&neigh_handler, attr_ex.vid); + attr_ex.comp_mask |= IBV_EXP_AH_ATTR_VID; + } + /* We are using only ethernet here */ + attr_ex.ll_address.len = neigh_get_ll(&neigh_handler, ethernet_ll, + sizeof(ethernet_ll)); + + if (attr_ex.ll_address.len <= 0) + goto free_resources; + + attr_ex.comp_mask |= IBV_EXP_AH_ATTR_LL; + attr_ex.ll_address.type = LL_ADDRESS_ETH; + attr_ex.ll_address.address = ethernet_ll; + + + ah = vctx->drv_exp_ibv_create_ah(pd, &attr_ex); + +free_resources: + neigh_free_resources(&neigh_handler); + +return_ah: +#endif if (ah) { ah->context = pd->context; ah->pd = pd; } - return ah; } default_symver(__ibv_create_ah, ibv_create_ah); static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, - union ibv_gid *gid) + union ibv_gid *gid, uint32_t gid_type) { + struct ibv_exp_gid_attr gid_attr; union ibv_gid sgid; int i = 0, ret; + gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE; + do { - ret = ibv_query_gid(context, port_num, i++, &sgid); - } while (!ret && memcmp(&sgid, gid, sizeof *gid)); + ret = ibv_query_gid(context, port_num, i, &sgid); + if (!ret) + ret = ibv_exp_query_gid_attr(context, port_num, i, + &gid_attr); + i++; + } while (!ret && (memcmp(&sgid, gid, sizeof *gid) || (gid_type != gid_attr.type))); return ret ? ret : i - 1; } +/* + * The functions ipv6_addr_set() and ipv6_addr_set_v4mapped() were copied from + * Linux/lib/checksum.c with the following license notice + * + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+*/ + +static inline void ipv6_addr_set(struct in6_addr *addr, + __be32 w1, __be32 w2, + __be32 w3, __be32 w4) +{ + addr->s6_addr32[0] = w1; + addr->s6_addr32[1] = w2; + addr->s6_addr32[2] = w3; + addr->s6_addr32[3] = w4; +} + +static inline void ipv6_addr_set_v4mapped(const __be32 addr, + struct in6_addr *v4mapped) +{ + ipv6_addr_set(v4mapped, + 0, 0, + htonl(0x0000FFFF), + addr); +} + +/* The functions do_csum() and from32to16() were copied from Linux/lib/checksum.c + * with the following license notice + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Andreas Schwab, + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: + * Fixed some nasty bugs, causing some horrible crashes. + * A: At some points, the sum (%0) was used as + * length-counter instead of the length counter + * (%1). Thanks to Roman Hodek for pointing this out. + * B: GCC seems to mess up if one uses too many + * data-registers to hold input values and one tries to + * specify d0 and d1 as scratch registers. Letting gcc + * choose these registers itself solves the problem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access + kills, so most of the assembly has to go. */ + +static inline unsigned short from32to16(unsigned long x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. 
*/ + x = (x & 0xffff) + (x >> 16); + return x; +} + +static unsigned int do_csum(const unsigned char *buff, int len) +{ + int odd, count; + unsigned int result = 0; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { +#if BYTE_ORDER == LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *)buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + unsigned int carry = 0; + do { + unsigned int w = *(unsigned int *)buff; + count--; + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (count); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *)buff; + buff += 2; + } + } + if (len & 1) +#if BYTE_ORDER == LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +uint16_t ip_fast_csum(const void *iph, unsigned int ihl) +{ + return ~do_csum(iph, ihl*4); +} + +struct iphdr { +#if BYTE_ORDER == LITTLE_ENDIAN + __u8 ihl:4, + version:4; +#else + __u8 version:4, + ihl:4; +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __be16 check; + __u32 saddr; + __u32 daddr; +}; + +struct ipv6hdr { +#if BYTE_ORDER == LITTLE_ENDIAN + __u8 priority:4, + version:4; +#else + __u8 version:4, + priority:4; +#endif + __u8 flow_lbl[3]; + + __be16 payload_len; + __u8 nexthdr; + __u8 hop_limit; + + struct in6_addr saddr; + struct in6_addr daddr; +}; + +int get_grh_header_version(void *h) +{ + struct iphdr *ip4h = (struct iphdr *)(h + 20); + struct iphdr ip4h_checked; + struct ipv6hdr *ip6h = (struct ipv6hdr *)h; + + if (ip6h->version != 6) + return (ip4h->version == 4) ? 
4 : 0; + /* version may be 6 or 4 */ + if (ip4h->ihl != 5) /* IPv4 header length must be 5 for RR */ + return 6; + /* Verify checksum. + We can't write on scattered buffers so we need to copy to temp buffer */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((uint8_t *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} + +#define CLASS_D_ADDR (0xeUL << 28) +#define CLASS_D_MASK (0xfUL << 28) int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, struct ibv_wc *wc, struct ibv_grh *grh, struct ibv_ah_attr *ah_attr) { + union { + union ibv_gid gid; + struct in6_addr addr; + } sgid; uint32_t flow_class; int ret; + struct iphdr *iph = (struct iphdr *)((void *)grh + 20); + int version; + int is_eth; + struct ibv_exp_port_attr port_attr; + uint32_t gid_type; + + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; + ret = ibv_exp_query_port(context, port_num, &port_attr); + if (ret) + return ret; + is_eth = (IBV_LINK_LAYER_ETHERNET == port_attr.link_layer); memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = wc->slid; @@ -529,17 +1027,56 @@ if (wc->wc_flags & IBV_WC_GRH) { ah_attr->is_global = 1; - ah_attr->grh.dgid = grh->sgid; - - ret = ibv_find_gid_index(context, port_num, &grh->dgid); - if (ret < 0) - return ret; - - ah_attr->grh.sgid_index = (uint8_t) ret; - flow_class = ntohl(grh->version_tclass_flow); - ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = grh->hop_limit; - ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + if (is_eth) + version = get_grh_header_version(grh); + else + version = 6; + if (version == 4) { + if (((ntohl)(iph->daddr) & CLASS_D_MASK) == CLASS_D_ADDR) + return EINVAL; + + if (iph->protocol == IPPROTO_UDP) + gid_type = IBV_EXP_ROCE_V2_GID_TYPE; + else + gid_type = IBV_EXP_ROCE_V1_5_GID_TYPE; + + 
ipv6_addr_set_v4mapped(iph->saddr, + (struct in6_addr *)&ah_attr->grh.dgid); + ipv6_addr_set_v4mapped(iph->daddr, (struct in6_addr *)&sgid.addr); + ret = ibv_find_gid_index(context, port_num, &sgid.gid, gid_type); + if (ret < 0) + return ret; + + ah_attr->grh.sgid_index = (uint8_t) ret; + + ah_attr->grh.flow_label = iph->id & 0xfffff; + ah_attr->grh.hop_limit = iph->ttl; + ah_attr->grh.traffic_class = iph->tos; + } else if (version == 6) { + ah_attr->grh.dgid = grh->sgid; + if (grh->dgid.raw[0] == 0xFF) + return EINVAL; + + if (grh->next_hdr == IPPROTO_UDP) + gid_type = IBV_EXP_ROCE_V2_GID_TYPE; + else if (grh->next_hdr == 0x1b) + gid_type = IBV_EXP_IB_ROCE_V1_GID_TYPE; + else + gid_type = IBV_EXP_ROCE_V1_5_GID_TYPE; + + ret = ibv_find_gid_index(context, port_num, &grh->dgid, gid_type); + if (ret < 0) + return ret; + + ah_attr->grh.sgid_index = (uint8_t) ret; + flow_class = ntohl(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = grh->hop_limit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } else { + errno = EPROTONOSUPPORT; + return EPROTONOSUPPORT; + } } return 0; } @@ -575,180 +1112,340 @@ } default_symver(__ibv_detach_mcast, ibv_detach_mcast); + +/* XRC compatibility layer: the legacy ibv_xrc_domain, xrc_srq and xrc_rcv_qp entry points, implemented on top of ibv_open_xrcd, ibv_create_srq_ex, ibv_create_qp_ex and ibv_open_qp */ struct ibv_xrc_domain *ibv_open_xrc_domain(struct ibv_context *context, int fd, int oflag) { - struct ibv_xrc_domain *d; - if (!context->more_ops) + struct ibv_xrcd *ibv_xrcd; + struct ibv_xrcd_init_attr xrcd_init_attr; + + memset(&xrcd_init_attr, 0, sizeof(xrcd_init_attr)); + + xrcd_init_attr.fd = fd; + xrcd_init_attr.oflags = oflag; + + xrcd_init_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | + IBV_XRCD_INIT_ATTR_OFLAGS; + + ibv_xrcd = ibv_open_xrcd(context, &xrcd_init_attr); + if (!ibv_xrcd) return NULL; - d = context->more_ops->open_xrc_domain(context, fd, oflag); - if (d) - d->context = context; + /* The caller should treat the returned pointer as opaque; internally it is used + * as an ibv_xrcd pointer. 
+ */ + return (struct ibv_xrc_domain *)ibv_xrcd; - return d; } -int ibv_close_xrc_domain(struct ibv_xrc_domain *d) + +struct ibv_srq *ibv_create_xrc_srq(struct ibv_pd *pd, + struct ibv_xrc_domain *xrc_domain, + struct ibv_cq *xrc_cq, + struct ibv_srq_init_attr *srq_init_attr) { - if (!d->context->more_ops) - return 0; - return d->context->more_ops->close_xrc_domain(d); -} + struct ibv_srq_init_attr_ex ibv_srq_init_attr_ex; + struct ibv_srq_legacy *ibv_srq_legacy; + struct ibv_srq *ibv_srq; + uint32_t xrc_srq_num; + struct verbs_context_exp *vctx; -int ibv_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_rcv_qpn) -{ - struct ibv_context *c; - if (!init_attr || !(init_attr->xrc_domain)) - return EINVAL; + vctx = verbs_get_exp_ctx_op(pd->context, drv_exp_set_legacy_xrc); + if (!vctx) { + errno = ENOSYS; + return NULL; + } + memset(&ibv_srq_init_attr_ex, 0, sizeof ibv_srq_init_attr_ex); - c = init_attr->xrc_domain->context; - if (!c->more_ops) - return ENOSYS; + ibv_srq_init_attr_ex.xrcd = (struct ibv_xrcd *)xrc_domain; + ibv_srq_init_attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_XRCD | + IBV_SRQ_INIT_ATTR_TYPE | + IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD; - return c->more_ops->create_xrc_rcv_qp(init_attr, - xrc_rcv_qpn); -} + ibv_srq_init_attr_ex.cq = xrc_cq; + ibv_srq_init_attr_ex.pd = pd; + ibv_srq_init_attr_ex.srq_type = IBV_SRQT_XRC; -int ibv_modify_xrc_rcv_qp(struct ibv_xrc_domain *d, - uint32_t xrc_rcv_qpn, - struct ibv_qp_attr *attr, - int attr_mask) -{ - if (!d || !attr) - return EINVAL; + ibv_srq_init_attr_ex.attr.max_sge = srq_init_attr->attr.max_sge; + ibv_srq_init_attr_ex.attr.max_wr = srq_init_attr->attr.max_wr; + ibv_srq_init_attr_ex.attr.srq_limit = srq_init_attr->attr.srq_limit; + ibv_srq_init_attr_ex.srq_context = srq_init_attr->srq_context; - if (!d->context->more_ops) - return ENOSYS; + ibv_srq = ibv_create_srq_ex(pd->context, &ibv_srq_init_attr_ex); + if (!ibv_srq) + return NULL; + + /* The handle value LEGACY_XRC_SRQ_HANDLE is 
reserved; if the SRQ just created got it, + * allocate another one and then free the first in order to get a different handle. + */ + if (ibv_srq->handle == LEGACY_XRC_SRQ_HANDLE) { + + struct ibv_srq *ibv_srq_tmp = ibv_srq; + int ret; + + ibv_srq = ibv_create_srq_ex(pd->context, &ibv_srq_init_attr_ex); + /* now destroying previous one. NOTE(review): if this destroy fails, the early return below does not destroy the SRQ created on the line above - looks like a leak, confirm */ + ret = ibv_destroy_srq(ibv_srq_tmp); + if (ret) { + fprintf(stderr, PFX "ibv_create_xrc_srq, fail to destroy intermediate srq\n"); + return NULL; + } + + if (!ibv_srq) + return NULL; + + /* still got the reserved value - destroy this one too and report EAGAIN */ + if (ibv_srq->handle == LEGACY_XRC_SRQ_HANDLE) { + ret = ibv_destroy_srq(ibv_srq); + if (ret) + fprintf(stderr, PFX "ibv_create_xrc_srq, fail to destroy intermediate srq\n"); + errno = EAGAIN; + return NULL; + } + } + + ibv_srq_legacy = calloc(1, sizeof(*ibv_srq_legacy)); + if (!ibv_srq_legacy) { + errno = ENOMEM; + goto err; + } + + if (ibv_get_srq_num(ibv_srq, &xrc_srq_num)) + goto err_free; + + ibv_srq_legacy->ibv_srq = ibv_srq; + ibv_srq_legacy->xrc_srq_num = xrc_srq_num; + + /* setting the bin compat fields */ + ibv_srq_legacy->xrc_srq_num_bin_compat = xrc_srq_num; + ibv_srq_legacy->xrc_domain_bin_compat = xrc_domain; + ibv_srq_legacy->xrc_cq_bin_compat = xrc_cq; + ibv_srq_legacy->context = pd->context; + ibv_srq_legacy->srq_context = srq_init_attr->srq_context; + ibv_srq_legacy->pd = pd; + /* Set an indication that this is a legacy structure. + * Whenever this indication is present, the internal ibv_srq, which has the real handle and fields, should be used. + * + */ + ibv_srq_legacy->handle = LEGACY_XRC_SRQ_HANDLE; + ibv_srq_legacy->xrc_domain = xrc_domain; + ibv_srq_legacy->xrc_cq = xrc_cq; + /* mutex & cond are not set on legacy_ibv_srq, internal ones are used. + * We don't expect application to use them. 
+ */ + ibv_srq_legacy->events_completed = 0; + + vctx->drv_exp_set_legacy_xrc(ibv_srq, ibv_srq_legacy); + return (struct ibv_srq *)(ibv_srq_legacy); + +err_free: + free(ibv_srq_legacy); +err: + ibv_destroy_srq(ibv_srq); + return NULL; - return d->context->more_ops->modify_xrc_rcv_qp(d, xrc_rcv_qpn, attr, - attr_mask); } -int ibv_query_xrc_rcv_qp(struct ibv_xrc_domain *d, - uint32_t xrc_rcv_qpn, - struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr) + + +static pthread_mutex_t xrc_tree_mutex = PTHREAD_MUTEX_INITIALIZER; +static void *ibv_xrc_qp_tree; /* root of the search tree of stored QPs, ordered by QP number; accessed through tfind, tdelete and tsearch with xrc_tree_mutex held */ + +static int xrc_qp_compare(const void *a, const void *b) /* tree comparator: three-way compare of the uint32_t values that a and b point to */ { - if (!d) - return EINVAL; - if (!d->context->more_ops) - return ENOSYS; + if ((*(uint32_t *) a) < (*(uint32_t *) b)) + return -1; + else if ((*(uint32_t *) a) > (*(uint32_t *) b)) + return 1; + else + return 0; - return d->context->more_ops->query_xrc_rcv_qp(d, xrc_rcv_qpn, attr, - attr_mask, init_attr); } -int ibv_reg_xrc_rcv_qp(struct ibv_xrc_domain *d, - uint32_t xrc_rcv_qpn) +struct ibv_qp *ibv_find_xrc_qp(uint32_t qpn) { - return d->context->more_ops->reg_xrc_rcv_qp(d, xrc_rcv_qpn); + uint32_t **qpn_ptr; + struct ibv_qp *ibv_qp = NULL; + + pthread_mutex_lock(&xrc_tree_mutex); + qpn_ptr = tfind(&qpn, &ibv_xrc_qp_tree, xrc_qp_compare); + if (!qpn_ptr) + goto end; + + ibv_qp = container_of(*qpn_ptr, struct ibv_qp, qp_num); /* the stored key is the address of a qp_num field (see the tsearch call in ibv_store_xrc_qp), so step back to the enclosing ibv_qp */ + +end: + pthread_mutex_unlock(&xrc_tree_mutex); + return ibv_qp; } -int ibv_unreg_xrc_rcv_qp(struct ibv_xrc_domain *d, - uint32_t xrc_rcv_qpn) +static int ibv_clear_xrc_qp(uint32_t qpn) { - return d->context->more_ops->unreg_xrc_rcv_qp(d, xrc_rcv_qpn); -} + uint32_t **qpn_ptr; + int ret = 0; + pthread_mutex_lock(&xrc_tree_mutex); + qpn_ptr = tdelete(&qpn, &ibv_xrc_qp_tree, xrc_qp_compare); + if (!qpn_ptr) + ret = EINVAL; /* NULL from tdelete: qpn is not in the tree */ -static uint16_t get_vlan_id(const union ibv_gid *dgid) -{ - return dgid->raw[11] << 8 | dgid->raw[12]; + pthread_mutex_unlock(&xrc_tree_mutex); + return ret; } -static void get_ll_mac(const 
union ibv_gid *gid, uint8_t *mac) +static int ibv_store_xrc_qp(struct ibv_qp *qp) { - memcpy(mac, &gid->raw[8], 3); - memcpy(mac + 3, &gid->raw[13], 3); - mac[0] ^= 2; + uint32_t **qpn_ptr; + int ret = 0; + + if (ibv_find_xrc_qp(qp->qp_num)) { + /* set an error in case qpn already exists, not expected to happen. NOTE(review): this lookup and the tsearch below take xrc_tree_mutex separately, so the existence check is not atomic with the insert - confirm callers cannot race */ + fprintf(stderr, PFX "ibv_store_xrc_qp failed, qpn=%u is already stored\n", + qp->qp_num); + return EEXIST; + } + + pthread_mutex_lock(&xrc_tree_mutex); + qpn_ptr = tsearch(&qp->qp_num, &ibv_xrc_qp_tree, xrc_qp_compare); /* the key handed to the tree is the address of qp->qp_num, not a copy of the number */ + if (!qpn_ptr) + ret = EINVAL; /* NOTE(review): POSIX tsearch returns NULL only when it cannot allocate a node, so ENOMEM may describe this failure better - confirm */ + + pthread_mutex_unlock(&xrc_tree_mutex); + return ret; + } -static int is_multicast_gid(const union ibv_gid *gid) +int ibv_close_xrc_domain(struct ibv_xrc_domain *d) { - return gid->raw[0] == 0xff; + struct ibv_xrcd *ibv_xrcd = (struct ibv_xrcd *)d; + return ibv_close_xrcd(ibv_xrcd); } -static void get_mcast_mac(const union ibv_gid *gid, uint8_t *mac) +int ibv_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, + uint32_t *xrc_rcv_qpn) { - int i; + struct ibv_xrcd *ibv_xrcd; + struct ibv_qp_init_attr_ex qp_init_attr_ex; + struct ibv_qp *ibv_qp; + int ret; - mac[0] = 0x33; - mac[1] = 0x33; - for (i = 2; i < 6; ++i) - mac[i] = gid->raw[i + 10]; -} + if (!init_attr || !(init_attr->xrc_domain)) + return EINVAL; -static int is_link_local_gid(const union ibv_gid *gid) -{ - uint32_t hi = *(uint32_t *)(gid->raw); - uint32_t lo = *(uint32_t *)(gid->raw + 4); - if (hi == htonl(0xfe800000) && lo == 0) - return 1; + ibv_xrcd = (struct ibv_xrcd *) init_attr->xrc_domain; + memset(&qp_init_attr_ex, 0, sizeof(qp_init_attr_ex)); + qp_init_attr_ex.qp_type = IBV_QPT_XRC_RECV; + qp_init_attr_ex.comp_mask = IBV_QP_INIT_ATTR_XRCD; + qp_init_attr_ex.xrcd = ibv_xrcd; + + ibv_qp = ibv_create_qp_ex(ibv_xrcd->context, &qp_init_attr_ex); + if (!ibv_qp) + return errno; + + /* Hand the QP number back through xrc_rcv_qpn and keep the ibv_qp in the lookup tree so it can later be found by number. NOTE(review): xrc_rcv_qpn is dereferenced without a NULL check, unlike init_attr - confirm callers always pass it */ + *xrc_rcv_qpn = ibv_qp->qp_num; + ret = ibv_store_xrc_qp(ibv_qp); + if (ret) { + int err; + err = 
ibv_destroy_qp(ibv_qp); + if (err) + fprintf(stderr, PFX "ibv_create_xrc_rcv_qp, ibv_destroy_qp failed, err=%d\n", err); + return ret; + } return 0; } -static int resolve_gid(const union ibv_gid *dgid, uint8_t *mac, uint8_t *is_mcast) + +int ibv_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, + uint32_t xrc_qp_num, + struct ibv_qp_attr *attr, int attr_mask) { - if (is_link_local_gid(dgid)) { - get_ll_mac(dgid, mac); - *is_mcast = 0; - } else if (is_multicast_gid(dgid)) { - get_mcast_mac(dgid, mac); - *is_mcast = 1; - } else - return -EINVAL; + struct ibv_qp *qp; + + qp = ibv_find_xrc_qp(xrc_qp_num); + if (!qp) + return EINVAL; + + /* xrc_domain is not used; the QP is located by its number only */ + return ibv_modify_qp(qp, attr, attr_mask); - return 0; } -static int is_tagged_vlan(const union ibv_gid *gid) +int ibv_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num, + struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr) { - uint16_t tag; + struct ibv_qp *qp; - tag = gid->raw[11] << 8 | gid->raw[12]; + qp = ibv_find_xrc_qp(xrc_qp_num); + if (!qp) + return EINVAL; - return tag < 0x1000; -} + /* xrc_domain is not used; the QP is located by its number only */ + return ibv_query_qp(qp, attr, attr_mask, init_attr); -int __ibv_resolve_eth_gid(const struct ibv_pd *pd, uint8_t port_num, - union ibv_gid *dgid, uint8_t sgid_index, - uint8_t mac[], uint16_t *vlan, uint8_t *tagged, - uint8_t *is_mcast) +} +int ibv_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, uint32_t xrc_qp_num) { - int err; - union ibv_gid sgid; - int stagged, svlan; - err = resolve_gid(dgid, mac, is_mcast); - if (err) - return err; + struct ibv_qp *qp; + struct ibv_qp_open_attr attr; + struct ibv_xrcd *ibv_xrcd = (struct ibv_xrcd *)xrc_domain; + int ret; - err = ibv_query_gid(pd->context, port_num, sgid_index, &sgid); - if (err) - return err; + memset(&attr, '\0', sizeof(attr)); + + attr.qp_num = xrc_qp_num; + attr.qp_type = IBV_QPT_XRC_RECV; + attr.xrcd = ibv_xrcd; + attr.comp_mask = IBV_QP_OPEN_ATTR_XRCD | IBV_QP_OPEN_ATTR_NUM | 
+ IBV_QP_OPEN_ATTR_TYPE; + + qp = ibv_open_qp(ibv_xrcd->context, &attr); + if (!qp) + return errno; + /* xrc_qp_num should be equal to qp->qp_num - same kernel qp. + * This API is expected to be called from a process other than the creating one. + * The same qpn is never mapped to more than 1 ibv_qp pointer. + */ + ret = ibv_store_xrc_qp(qp); + if (ret) { + int err; + err = ibv_destroy_qp(qp); + if (err) + fprintf(stderr, PFX "ibv_reg_xrc_rcv_qp, ibv_destroy_qp failed, err=%d\n", err); - stagged = is_tagged_vlan(&sgid); - if (stagged) { - if (!is_tagged_vlan(dgid) && !is_mcast) - return -1; + return ret; + } - svlan = get_vlan_id(&sgid); - if (svlan != get_vlan_id(dgid) && !is_mcast) - return -1; + return 0; + +} - *tagged = 1; - *vlan = svlan; - } else - *tagged = 0; +int ibv_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, + uint32_t xrc_qp_num) +{ + + struct ibv_qp *qp; + int ret; + + qp = ibv_find_xrc_qp(xrc_qp_num); + if (!qp) + return EINVAL; + + ret = ibv_clear_xrc_qp(xrc_qp_num); + if (ret) { + fprintf(stderr, PFX "ibv_unreg_xrc_rcv_qp, fail via clear, qpn=%u, err=%d\n", + xrc_qp_num, ret); + return ret; + } + + return ibv_destroy_qp(qp); - return 0; } -default_symver(__ibv_resolve_eth_gid, ibv_resolve_eth_gid); Index: contrib/ofed/usr.lib/libibverbs/Makefile =================================================================== --- contrib/ofed/usr.lib/libibverbs/Makefile +++ contrib/ofed/usr.lib/libibverbs/Makefile @@ -14,23 +14,37 @@ SHLIB_MAJOR= 1 MK_PROFILE= no -SRCS= device.c init.c marshall.c verbs.c cmd.c enum_strs.c kern_abi.h \ - memory.c compat-1_0.c sysfs.c - -MAN= ibv_alloc_pd.3 ibv_asyncwatch.1 ibv_attach_mcast.3 ibv_create_ah.3 \ - ibv_create_ah_from_wc.3 ibv_create_comp_channel.3 ibv_create_cq.3 \ - ibv_create_qp.3 ibv_create_srq.3 ibv_devices.1 ibv_devinfo.1 \ - ibv_event_type_str.3 ibv_fork_init.3 ibv_get_async_event.3 \ - ibv_get_cq_event.3 ibv_get_device_guid.3 ibv_get_device_list.3 \ - ibv_get_device_name.3 ibv_modify_qp.3 ibv_modify_srq.3 
\ - ibv_open_device.3 ibv_poll_cq.3 ibv_post_recv.3 ibv_post_send.3 \ - ibv_post_srq_recv.3 ibv_query_device.3 ibv_query_gid.3 \ - ibv_query_pkey.3 ibv_query_port.3 ibv_query_qp.3 ibv_query_srq.3 \ - ibv_rate_to_mult.3 ibv_rc_pingpong.1 ibv_reg_mr.3 ibv_req_notify_cq.3 \ - ibv_resize_cq.3 ibv_srq_pingpong.1 ibv_uc_pingpong.1 ibv_ud_pingpong.1 \ - ibv_query_xrc_rcv_qp.3 ibv_reg_xrc_rcv_qp.3 ibv_modify_xrc_rcv_qp.3 \ - verbs.7 ibv_create_xrc_rcv_qp.3 ibv_open_xrc_domain.3 - +SRCS= \ +cmd.c cmd_exp.c compat-1_0.c device.c enum_strs.c init.c marshall.c \ +memory.c sysfs.c verbs.c + +MAN= \ +ibv_alloc_mw.3 ibv_alloc_pd.3 ibv_asyncwatch.1 ibv_attach_mcast.3 \ +ibv_cc_pingpong.1 ibv_create_ah.3 ibv_create_ah_from_wc.3 \ +ibv_create_comp_channel.3 ibv_create_cq.3 ibv_create_flow.3 \ +ibv_create_qp.3 ibv_create_qp_ex.3 ibv_create_srq.3 ibv_create_srq_ex.3 \ +ibv_create_xrc_rcv_qp.3 ibv_devices.1 ibv_devinfo.1 \ +ibv_event_type_str.3 ibv_exp_alloc_mkey_list_memory.3 ibv_exp_bind_mw.3 \ +ibv_exp_create_cq.3 ibv_exp_create_dct.3 ibv_exp_create_mr.3 \ +ibv_exp_create_qp.3 ibv_exp_create_res_domain.3 \ +ibv_exp_create_rwq_ind_table.3 ibv_exp_create_wq.3 \ +ibv_exp_dealloc_mkey_list_memory.3 ibv_exp_get_provider_func.3 \ +ibv_exp_modify_cq.3 ibv_exp_modify_qp.3 ibv_exp_modify_wq.3 \ +ibv_exp_poll_cq.3 ibv_exp_post_send.3 ibv_exp_post_task.3 \ +ibv_exp_prefetch_mr.3 ibv_exp_query_dct.3 ibv_exp_query_device.3 \ +ibv_exp_query_gid_attr.3 ibv_exp_query_intf.3 ibv_exp_query_mkey.3 \ +ibv_exp_query_values.3 ibv_exp_reg_mr.3 ibv_exp_reg_shared_mr.3 \ +ibv_exp_rereg_mr.3 ibv_fork_init.3 ibv_get_async_event.3 \ +ibv_get_cq_event.3 ibv_get_device_guid.3 ibv_get_device_list.3 \ +ibv_get_device_name.3 ibv_get_srq_num.3 ibv_intf.1 ibv_modify_qp.3 \ +ibv_modify_srq.3 ibv_modify_xrc_rcv_qp.3 ibv_open_device.3 \ +ibv_open_qp.3 ibv_open_xrc_domain.3 ibv_open_xrcd.3 ibv_poll_cq.3 \ +ibv_post_recv.3 ibv_post_send.3 ibv_post_srq_recv.3 ibv_query_device.3 \ +ibv_query_gid.3 ibv_query_pkey.3 
ibv_query_port.3 ibv_query_qp.3 \ +ibv_query_srq.3 ibv_query_xrc_rcv_qp.3 ibv_rate_to_mbps.3 \ +ibv_rate_to_mult.3 ibv_rc_pingpong.1 ibv_reg_mr.3 ibv_reg_xrc_rcv_qp.3 \ +ibv_req_notify_cq.3 ibv_resize_cq.3 ibv_shared_mr.1 ibv_srq_pingpong.1 \ +ibv_task_pingpong.1 ibv_uc_pingpong.1 ibv_ud_pingpong.1 verbs.7 CFLAGS+= -DHAVE_CONFIG_H -DIBV_CONFIG_DIR=\"/etc/ibverbs/\" Index: contrib/ofed/usr.lib/libibverbs/config.h =================================================================== --- contrib/ofed/usr.lib/libibverbs/config.h +++ contrib/ofed/usr.lib/libibverbs/config.h @@ -1 +1,38 @@ -#include +#include "alloca.h" + +#define memalign(align, size) ({ \ + void *__ptr; \ + if (posix_memalign(&__ptr, (align), (size))) \ + __ptr = NULL; \ + __ptr; \ +}) + +#define HAVE_SYMVER_SUPPORT 1 + +#define NRESOLVE_NEIGH 1 + +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif + +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif + +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif + +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif +