Index: contrib/ofed/libmlx4/Makefile.am
===================================================================
--- contrib/ofed/libmlx4/Makefile.am
+++ contrib/ofed/libmlx4/Makefile.am
@@ -1,12 +1,19 @@
-AM_CFLAGS = -g -Wall -D_GNU_SOURCE
+AM_CFLAGS = -g -Wall -Werror -D_GNU_SOURCE
 mlx4_version_script = @MLX4_VERSION_SCRIPT@
 MLX4_SOURCES = src/buf.c src/cq.c src/dbrec.c src/mlx4.c src/qp.c \
-    src/srq.c src/verbs.c
+    src/srq.c src/verbs.c src/verbs_exp.c
+noinst_HEADERS = src/bitmap.h src/doorbell.h src/list.h src/mlx4-abi.h src/mlx4_exp.h src/mlx4.h src/wqe.h
 if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
-    lib_LTLIBRARIES = src/libmlx4.la
+    lib_LTLIBRARIES =
+else
+    mlx4lib_LTLIBRARIES =
+endif
+
+if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
+    lib_LTLIBRARIES += src/libmlx4.la
     src_libmlx4_la_SOURCES = $(MLX4_SOURCES)
     src_libmlx4_la_LDFLAGS = -avoid-version -release @IBV_DEVICE_LIBRARY_EXTENSION@ \
         $(mlx4_version_script)
@@ -14,13 +21,14 @@
     mlx4conf_DATA = mlx4.driver
 else
     mlx4libdir = $(libdir)/infiniband
-    mlx4lib_LTLIBRARIES = src/mlx4.la
+    mlx4lib_LTLIBRARIES += src/mlx4.la
     src_mlx4_la_SOURCES = $(MLX4_SOURCES)
     src_mlx4_la_LDFLAGS = -avoid-version -module $(mlx4_version_script)
 endif
-EXTRA_DIST = src/doorbell.h src/mlx4.h src/mlx4-abi.h src/wqe.h \
-    src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST = src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST += debian
+EXTRA_DIST += autogen.sh
 dist-hook: libmlx4.spec
 	cp libmlx4.spec $(distdir)
Index: contrib/ofed/libmlx4/autogen.sh
===================================================================
--- contrib/ofed/libmlx4/autogen.sh
+++ contrib/ofed/libmlx4/autogen.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#! /bin/sh -eE
 set -x
 aclocal -I config
Index: contrib/ofed/libmlx4/configure.ac
===================================================================
--- contrib/ofed/libmlx4/configure.ac
+++ contrib/ofed/libmlx4/configure.ac
@@ -1,12 +1,15 @@
 dnl Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.57)
-AC_INIT(libmlx4, 1.0, general@lists.openfabrics.org)
+AC_INIT(libmlx4, 1.0.6mlnx1, linux-rdma@vger.kernel.org)
 AC_CONFIG_SRCDIR([src/mlx4.h])
 AC_CONFIG_AUX_DIR(config)
-AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(libmlx4, 1.0)
-AM_PROG_LIBTOOL
+AC_CONFIG_HEADER(config.h)
+AM_INIT_AUTOMAKE([1.10 foreign tar-ustar silent-rules subdir-objects])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_PROG_LIBTOOL
+LT_INIT
 AC_ARG_WITH([valgrind],
     AC_HELP_STRING([--with-valgrind],
@@ -21,6 +24,13 @@ fi
 fi
+#--with-wqe-format
+AC_ARG_WITH([wqe-format],
+    AC_HELP_STRING([--with-wqe-format],
+        [Enable wqe-format annotations (default NO)]),
+    AC_DEFINE([MLX4_WQE_FORMAT], 1, [Define to 1 to enable wqe-format annotations.]),
+)
+
 dnl Checks for programs
 AC_PROG_CC
@@ -32,22 +42,19 @@
 AC_CHECK_HEADER(infiniband/driver.h, [],
     AC_MSG_ERROR([<infiniband/driver.h> not found. libmlx4 requires libibverbs.]))
 AC_HEADER_STDC
-AC_CHECK_HEADER(valgrind/memcheck.h,
-    [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
-        [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
-    [if test $want_valgrind = yes; then
-        AC_MSG_ERROR([Valgrind memcheck support requested, but not found.])
-    fi])
+
+if test x$want_valgrind = xyes; then
+    AC_CHECK_HEADER(valgrind/memcheck.h,
+        [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
+            [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
+        [if test $want_valgrind = yes; then
+            AC_MSG_ERROR([Valgrind memcheck support requested, but not found.])
+        fi])
+fi
 dnl Checks for typedefs, structures, and compiler characteristics.
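For orientation, a minimal sketch of how the two configure-time results introduced above (MLX4_WQE_FORMAT and HAVE_VALGRIND_MEMCHECK_H) are typically consumed from the generated config.h; the no-op fallback macro here is an assumption for illustration, not code from this patch.

/* sketch only: consuming the configure results in a C translation unit */
#include "config.h"                      /* generated by the configure script */

#ifdef HAVE_VALGRIND_MEMCHECK_H
#include <valgrind/memcheck.h>
#else
/* assumed no-op fallback so the driver builds without valgrind headers */
#define VALGRIND_MAKE_MEM_DEFINED(addr, len) 0
#endif

#ifdef MLX4_WQE_FORMAT
/* --with-wqe-format was given: alternate WQE layout code is compiled in */
#endif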
AC_C_CONST AC_CHECK_SIZEOF(long) -AC_CHECK_MEMBER(struct ibv_context.more_ops, - [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],, - [#include ]) -AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq, - [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],, - [#include ]) dnl Checks for library functions AC_CHECK_FUNC(ibv_read_sysfs_file, [], Index: contrib/ofed/libmlx4/debian/changelog =================================================================== --- contrib/ofed/libmlx4/debian/changelog +++ contrib/ofed/libmlx4/debian/changelog @@ -1,8 +1,201 @@ -libmlx4 (1.0-2) unstable; urgency=low +libmlx4 (1.0.6mlnx1-1) unstable; urgency=low - * Add debian/watch file + * libmlx4: Fix MR address change in rereg_mr + * libmlx4: revert the endianess fix for immediate data + * libmlx4: split post_send_one to qp types + * libmlx4: Add post_send_one to qp struct + * libmlx4: remove inl from basic set_data_seg functions + * libmlx4: Set data segment in one function + * libmlx4: set ctrl segment in one funtion + * libmlx4: use htonl when copy immediate data to WQE + * libmlx4: fix bug in bf_buf_size update + * libmlx4: Define set_data_seg as inline function + * libmlx4: reduce cache used by datapath + * libmlx4: optimize wq_overflow + * libmlx4: Add anothe DB ringing method + * libmlx4: Use x86_64 SSE2 instructions to improve bf_copy + * libmlx4: Add new DB ringing mode + * libmlx4: use all 8 BFs + * libmlx4: split ring_db function + * libmlx4: add door-bell ring function + * Modify call from ibv_exp_getenv to ibv_exp_cmd_getenv + * libmlx4: fix contiguous page registration + * Modify to use verbs specific getenv + * libmlx4: avoid creating AH with DLID 0 + * libmlx4: fixed resize cq overrun bug + * libmlx4.spec.in: Changed valgrind libs DESTDIR + * Added valgrind support + * fixed and added valgrind Macros + * Adding experimental dereg_mr support + * shared_mr: handle duplication from glob/procfs + * shared_mr: fine-tuned counter mode name + * fix 32 bit compile warning + * shared mr with counter name support + * libmlx4: allow user to specify the addr of contig pages. + * libmlx4: avoid using gettimeofday in mlx4_reg_shared_mr. + * libmlx4: init exp_mw_bind. + * libmlx4: added -Werror to Makefile + * ibmlx4: Use masked atomics only if max_atomic_arg defined + * wc_flags should be set even when using experimental verbs + * libmlx4: return errno on ibv_post_srq_recv + * libmlx4: Retry open shared mr file + * libmlx4: Add completion opcodes for masked atomic operations + * Verify hop_limit > 1 in create_ah + * libmlx4.spec.in: Support configure_options flag. + * configure: Update AM_INIT_AUTOMAKE to support new auto tools. + * Add MR re-registeration + * mlx4: Add support for timestamping when initiating context. + * libmlx4: Do not publish support for IBV_CALC_OP_MAXLOC + * Fix comp_mask handling in ibv_exp_query_values + * libmlx4: Simplify extended atomics API + * libmlx4: Fix wrong wqe pointer advance + * libmlx4: Add support for masked atomics + * Revert "libmlx4: Fix log function to avoid overflow" + * libmlx4: add ibv_exp_modify_qp to mlx4 + * libmlx4: Fix overflow on flag mask + * libmlx4: Fix log function to avoid overflow + * libmlx4: improve experimental interface + * A correct AH was free'd by mistake + * Align create_ah_ex and query_port_ex to upstream + * Change imm_data to ex.imm_data or ex.invalidate_rkey + * libmlx4: change wc_size from int to uint32_t. + * libmlx4: Print prefer_bf message only in trace mode. 
+ * libmlx4: separate mlx4_post_send to EXP & NON EXP - -- Roland Dreier Wed, 12 Mar 2008 10:40:19 -0700 + -- Vladimir Sokolovsky Wed, 10 Dec 2014 10:53:10 +0200 + +libmlx4 (1.0.5mlnx1-1) unstable; urgency=low + + * resize_cq: fix possible endless loop scanning CQ + * User QP/SRQ in work completion + * libmlx4: Align verbs interface with upstream + * libmlx4: add ibv_exp_reg_mr experimental verb + * libmlx4: Change legacy extended verbs to experimental verbs + * libmlx4: Change legacy extended uverbs to experimental uverbs + * unmap hca_clock_page in mlx4_uninit_context + * Enable contigous pages for Control resources by default + * New experimental verbs for query_port + * Added htobe64 definition which is missing on SLES10 + * Fix QoS issues for UD QPs + * Allocate zoeroized memory for CQ + * libmlx4: Change sandy bridge work around algorithm + * libmlx4: add debian to EXTRA_DIST + * libmlx4: add support for "git review" command line gerrit tool + * libmlx4: Fix "make distcheck" + * Add allowed_wc_flags + * libmlx4: Fix valgrind errors. + * Raw IB QP fix + * libmlx4: Change inline receive interface + * Revert "move flow steering to experimental verbs" + * move flow steering to experimental verbs + * libmlx4: resolve segfault on ibv_xsrq_pingpong + * Raw Eth QP - prevent loopback on SRIOV + * libmlx4: remove struct ts and use direct field timestamp + * Fix compilation issue due to shifting bind_mw struct in ib_send_wr + * libmlx4: Add experimental inline receive + * Double check in order to prevent division by zero. + * Add a missing check for a value of a certain variable + * libmlx4 - qp: optimize single segment case around set_data_seg() + * libmlx4 - Inform GCC about hotspot functions so those can be optimized more aggressively. + * libmlx4 - Add branch prediction helpers to qp and cq data path functions. + * libmlx4 - Using unsigned indices allow GCC to generate a bit more efficient code. + * IP based addressing support + * Implementing verbs bind_mw (for binding type 1 memory windows) + * Adding support to post bind (type 2) memory windows + * Adding support to post invalidate messages + * Implementing verbs alloc_mw and dealloc_mw + * Adding work completions that are related to memory windows + * fix incorrect timestamp + * add a workaround for hw bug in hwclock wraparound + * extension verb: mlx4_query_values are reading hwclock + * extension verb: mlx4_query_device_ex + * extension verb: mlx4_create_cq_ex + * implement ibv_poll_cq_ex extension verb + * XRC - move warning to be under trace mode + * XRC - fix leak in legacy flow + * libmlx4 : Globaly avoid spinlocks for multithreaded apps + * Handle missing symbols in Xen server 6.1 + * libmlx4: Cache link layer's type in mlx4_context. Caching will allow us to avoid ibv_query_port calls and save time in ibv_create_ah. 
+ * XRC - sync to latest upstream changes + * XRC issues + * libmlx4: XRC binary compat layer + + -- Vladimir Sokolovsky Sun, 23 Mar 2014 14:16:10 +0200 + +libmlx4 (1.0.4mlnx2-1) unstable; urgency=low + + * libmlx4: Add Cross-channel capability + * libmlx4: Add mlx4_post_task + * libmlx4: Add mlx4_query_device_ex + * libmlx4: Add mlx4_modify_cq + * libmlx4: Support Cross-channel capability in mlx4_create_qp_ex + * libmlx4: Add new fields and opcodes to support Cross-channel + * libmlx4: Remove legacy mverbs code + * libmlx4: Add support for XRC QPs + * libmlx4: contig pages over 4GB + * stall code to be run only on x86 + * Implement ibv_create_flow and ibv_destroy_flow + * Revert "Add support for ibv_attach_flow and ibv_detach_flow." + * libmlx4 fix compilation warnings + * Handle 0-length s/g list entries correctly + * libmlx4.spec.in: Fix %files macro + * configure: disable mverbs by default + * libmlx4: verbs extensions breaks MVERBS implementation + * shared_mr support on top of verbs extension + * libmlx4: Infra-structure changes to support verbs extensions + * fixed an issue with definition of container_of + * Revert "verbs extension mechanism based on Sean first patch" + + -- Vladimir Sokolovsky Mon, 7 Jan 2013 13:38:10 +0200 + +libmlx4 (1.0.4mlnx1-1) unstable; urgency=low + + * New Mellanox release. + + -- Vladimir Sokolovsky Mon, 7 Jan 2013 13:38:10 +0200 + +libmlx4 (1.0.4-1) unstable; urgency=low + + * New upstream release. + - IBoE multicast support. + * Update maintainer and remove DM-Upload-Allowed now that I'm a DD. + + -- Roland Dreier Wed, 28 Mar 2012 10:31:52 -0700 + +libmlx4 (1.0.3-1) unstable; urgency=low + + * New upstream release. + - Add ConnectX-3 support. + - Add IBoE support. + * Since we have plugin in /usr/lib/libibverbs, we need to depend on + libibverbs (>= 1.1.3). + + -- Roland Dreier Wed, 06 Jul 2011 23:54:24 -0700 + +libmlx4 (1.0.2-1) unstable; urgency=low + + * New upstream release. + - Fix potential problems running under Valgrind. + - Add support for resize CQ operation. + - Fix other minor bugs. + * Update maintainer and set DM-Upload-Allowed to yes. (Closes: #632108) + * Switch to dpkg-source 3.0 (quilt) format. + * Acknowledge NMU (Closes: #621664). + * Change build system from cdbs to debhelper 7. + * Use libibverbs 1.1.3 feature to move plugin to /usr/lib/libibverbs + to fix multiple problems with a not-exactly-shlib in /usr/lib. + * Add debian/watch file. + * Move -dbg package to section debug. + * Update to Standards-Version: 3.9.2. + + -- Roland Dreier Wed, 06 Jul 2011 13:32:18 -0700 + +libmlx4 (1.0-1.1) unstable; urgency=low + + * Non-maintainer upload. + * Don't ship .la files (Closes: #621664). 
+ + -- Luk Claes Fri, 01 Jul 2011 19:09:59 +0200 libmlx4 (1.0-1) unstable; urgency=low Index: contrib/ofed/libmlx4/debian/compat =================================================================== --- contrib/ofed/libmlx4/debian/compat +++ contrib/ofed/libmlx4/debian/compat @@ -1 +1 @@ -5 +7 Index: contrib/ofed/libmlx4/debian/control =================================================================== --- contrib/ofed/libmlx4/debian/control +++ contrib/ofed/libmlx4/debian/control @@ -1,16 +1,16 @@ Source: libmlx4 Priority: extra -Maintainer: Roland Dreier -Build-Depends: @cdbs@, libibverbs-dev (>= 1.0) -Standards-Version: 3.7.3 +Maintainer: Roland Dreier +Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.3) +Standards-Version: 3.9.2 Section: libs Homepage: http://www.openfabrics.org/ Package: libmlx4-1 Section: libs Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends} -Description: A userspace driver for Mellanox ConnectX InfiniBand HCAs +Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.3) +Description: Userspace driver for Mellanox ConnectX InfiniBand HCAs libmlx4 is a device-specific driver for Mellanox ConnectX InfiniBand host channel adapters (HCAs) for the libibverbs library. This allows userspace processes to access Mellanox HCA hardware directly with @@ -32,7 +32,7 @@ directly to an application, which may be useful for debugging. Package: libmlx4-1-dbg -Section: libdevel +Section: debug Priority: extra Architecture: any Depends: ${misc:Depends}, libmlx4-1 (= ${binary:Version}) Index: contrib/ofed/libmlx4/debian/libmlx4-1.install =================================================================== --- contrib/ofed/libmlx4/debian/libmlx4-1.install +++ contrib/ofed/libmlx4/debian/libmlx4-1.install @@ -1,2 +1,2 @@ -usr/lib/libmlx4-rdmav2.so +usr/lib/libmlx4-rdmav2.so /usr/lib/libibverbs/ etc/libibverbs.d/mlx4.driver Index: contrib/ofed/libmlx4/debian/libmlx4-dev.install =================================================================== --- contrib/ofed/libmlx4/debian/libmlx4-dev.install +++ contrib/ofed/libmlx4/debian/libmlx4-dev.install @@ -1 +1 @@ -usr/lib/libmlx4.{a,la} +usr/lib/libmlx4.a Index: contrib/ofed/libmlx4/debian/rules =================================================================== --- contrib/ofed/libmlx4/debian/rules +++ contrib/ofed/libmlx4/debian/rules @@ -1,8 +1,10 @@ #!/usr/bin/make -f # -*- mode: makefile; coding: utf-8 -*- -DEB_DH_INSTALL_SOURCEDIR := debian/tmp -DEB_AUTO_UPDATE_LIBTOOL := post +%: + dh $@ -include /usr/share/cdbs/1/rules/debhelper.mk -include /usr/share/cdbs/1/class/autotools.mk +override_dh_strip: + dh_strip --dbg-package=libmlx4-1-dbg + +override_dh_makeshlibs: Index: contrib/ofed/libmlx4/libmlx4.spec.in =================================================================== --- contrib/ofed/libmlx4/libmlx4.spec.in +++ contrib/ofed/libmlx4/libmlx4.spec.in @@ -1,15 +1,27 @@ +%{!?_with_valgrind: %define _with_valgrind 0} +%{!?_disable_valgrind: %define _disable_valgrind 0} + +%if 0%{?rhel} == 6 +%if 0%{_disable_valgrind} == 0 +%define _with_valgrind 1 +%endif +%endif + Name: libmlx4 -Version: 1.0 -Release: 2%{?dist} +Version: 1.0.6mlnx1 +Release: 1%{?dist} Summary: Mellanox ConnectX InfiniBand HCA Userspace Driver Group: System Environment/Libraries License: GPLv2 or BSD Url: http://openfabrics.org/ -Source: http://openfabrics.org/downloads/mlx4/libmlx4-1.0.tar.gz +Source: http://openfabrics.org/downloads/mlx4/libmlx4-%{version}.tar.gz BuildRoot: %(mktemp -ud 
%{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) -BuildRequires: libibverbs-devel >= 1.1-0.1.rc2 +BuildRequires: libibverbs-devel >= 1.1.6mlnx2 +%if %{_with_valgrind} +BuildRequires: valgrind-devel +%endif %description libmlx4 provides a device-specific userspace driver for Mellanox @@ -29,12 +41,24 @@ %setup -q -n %{name}-@VERSION@ %build -%configure +%if %{_with_valgrind} +%configure %{?configure_options} --libdir=%{_libdir}/mlnx_ofed/valgrind --with-valgrind +make %{?_smp_mflags} +make DESTDIR=$RPM_BUILD_DIR/%{name}-%{version}/valgrind install +rm -f $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind/*.*a +make clean +%endif + +%configure %{?configure_options} make %{?_smp_mflags} %install rm -rf $RPM_BUILD_ROOT make DESTDIR=%{buildroot} install +%if %{_with_valgrind} +mkdir -p %{buildroot}/%{_libdir}/mlnx_ofed +cp -a $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind %{buildroot}/%{_libdir}/mlnx_ofed +%endif # remove unpackaged files from the buildroot rm -f $RPM_BUILD_ROOT%{_libdir}/*.la $RPM_BUILD_ROOT%{_libdir}/libmlx4.so @@ -43,15 +67,34 @@ %files %defattr(-,root,root,-) -%{_libdir}/libmlx4-rdmav2.so +%{_libdir}/libmlx4*.so +%if %{_with_valgrind} +%{_libdir}/mlnx_ofed/valgrind/libmlx4*.so +%endif %{_sysconfdir}/libibverbs.d/mlx4.driver %doc AUTHORS COPYING README %files devel %defattr(-,root,root,-) -%{_libdir}/libmlx4.a +%{_libdir}/libmlx4*.a %changelog +* Mon Mar 28 2012 Roland Dreier - 1.0.4-1 +- New upstream release + +* Mon Mar 26 2012 Roland Dreier - 1.0.3-1 +- New upstream release + +* Wed Jul 6 2011 Roland Dreier - 1.0.2-1 +- New upstream release + +* Wed Jun 17 2009 Roland Dreier - 1.0.1-1 +- New upstream release +- Change openib.org URLs to openfabrics.org URLs + +* Wed Feb 25 2009 Fedora Release Engineering - 1.0-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + * Sun Jan 27 2008 Roland Dreier - 1.0-2 - Spec file cleanups, based on Fedora review: don't mark libmlx4.driver as a config file, since it is not user modifiable, Index: contrib/ofed/libmlx4/src/bitmap.h =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/bitmap.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2000, 2011 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef BITMAP_H +#define BITMAP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef min +#define min(a, b) \ + ({ typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; }) +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define MLX4_SHM_ADDR (void *)(0x8000000000000000UL) +#define MLX4_SHMAT_FLAGS (SHM_RND) +#else +#define MLX4_SHM_ADDR (void *)(0x0UL) +#define MLX4_SHMAT_FLAGS (0) +#endif + +struct __dummy_h { unsigned long a[100]; }; +#define MLX4_ADDR (*(struct __dummy_h *) addr) +#define MLX4_CONST_ADDR (*(const struct __dummy_h *) addr) + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define BITS_PER_BYTE 8 +#define BITS_PER_WORD (BITS_PER_BYTE * sizeof(uint32_t)) +#define BITS_TO_WORDS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(uint32_t)) + +#ifndef HPAGE_SIZE +#define HPAGE_SIZE (2UL*1024*1024) +#endif + +#define MLX4_SHM_LENGTH (HPAGE_SIZE) +#define MLX4_Q_CHUNK_SIZE 32768 +#define MLX4_SHM_NUM_REGION 64 + +struct mlx4_bitmap { + uint32_t last; + uint32_t top; + uint32_t max; + uint32_t avail; + uint32_t mask; + struct mlx4_spinlock lock; + uint32_t *table; +}; + +inline unsigned long mlx4_ffz(uint32_t word) +{ + return __builtin_ffs(~word) - 1; +} + +inline void mlx4_set_bit(unsigned int nr, uint32_t *addr) +{ + + addr[(nr / BITS_PER_WORD)] + |= (1 << (nr % BITS_PER_WORD)); + + +} + +inline void mlx4_clear_bit(unsigned int nr, uint32_t *addr) +{ + addr[(nr / BITS_PER_WORD)] + &= ~(1 << (nr % BITS_PER_WORD)); +} + +inline int mlx4_test_bit(unsigned int nr, const uint32_t *addr) +{ + return !!(addr[(nr / BITS_PER_WORD)] + & (1 << (nr % BITS_PER_WORD))); +} + +inline uint32_t mlx4_find_first_zero_bit(const uint32_t *addr, + uint32_t size) +{ + const uint32_t *p = addr; + uint32_t result = 0; + uint32_t tmp; + + while (size & ~(BITS_PER_WORD - 1)) { + tmp = *(p++); + if (~tmp) + goto found; + result += BITS_PER_WORD; + size -= BITS_PER_WORD; + } + if (!size) + return result; + + tmp = (*p) | (~0UL << size); + if (tmp == (uint32_t)~0UL) /* Are any bits zero? */ + return result + size; /* Nope. 
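As a quick illustration of the word/offset arithmetic used by mlx4_set_bit(), mlx4_test_bit() and mlx4_ffz() above (a standalone example; the bit index and table size are chosen arbitrarily):

#include <assert.h>
#include <stdint.h>

#define BITS_PER_WORD (8 * sizeof(uint32_t))

int main(void)
{
    uint32_t table[3] = {0, 0, 0};
    unsigned int nr = 70;                        /* bit 70 of the bitmap */

    /* same arithmetic as mlx4_set_bit(): word 70/32 == 2, offset 70%32 == 6 */
    table[nr / BITS_PER_WORD] |= 1u << (nr % BITS_PER_WORD);
    assert(table[2] == (1u << 6));

    /* mlx4_ffz() scans a word for its first zero bit: bit 0 of word 0 here */
    assert((__builtin_ffs(~table[0]) - 1) == 0);
    return 0;
}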
*/ +found: + return result + mlx4_ffz(tmp); +} + +int mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap) +{ + uint32_t obj; + int ret; + + mlx4_spin_lock(&bitmap->lock); + + obj = mlx4_find_first_zero_bit(bitmap->table, bitmap->max); + if (obj < bitmap->max) { + mlx4_set_bit(obj, bitmap->table); + bitmap->last = (obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + --bitmap->avail; + + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +static inline uint32_t find_aligned_range(uint32_t *bitmap, + uint32_t start, uint32_t nbits, + int len, int alignment) +{ + uint32_t end, i; + +again: + start = align(start, alignment); + + while ((start < nbits) && mlx4_test_bit(start, bitmap)) + start += alignment; + + if (start >= nbits) + return -1; + + end = start + len; + if (end > nbits) + return -1; + + for (i = start + 1; i < end; i++) { + if (mlx4_test_bit(i, bitmap)) { + start = i + 1; + goto again; + } + } + + return start; +} + +static inline int mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align) +{ + uint32_t obj; + int ret, i; + + if (cnt == 1 && align == 1) + return mlx4_bitmap_alloc(bitmap); + + if (cnt > bitmap->max) + return -1; + + mlx4_spin_lock(&bitmap->lock); + + obj = find_aligned_range(bitmap->table, bitmap->last, + bitmap->max, cnt, align); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + obj = find_aligned_range(bitmap->table, 0, bitmap->max, + cnt, align); + } + + if (obj < bitmap->max) { + for (i = 0; i < cnt; i++) + mlx4_set_bit(obj + i, bitmap->table); + if (obj == bitmap->last) { + bitmap->last = (obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + bitmap->avail -= cnt; + + mlx4_spin_unlock(&bitmap->lock); + + return obj; +} + +static inline void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, uint32_t obj, + int cnt) +{ + int i; + + obj &= bitmap->max - 1; + + mlx4_spin_lock(&bitmap->lock); + for (i = 0; i < cnt; i++) + mlx4_clear_bit(obj + i, bitmap->table); + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + bitmap->avail += cnt; + mlx4_spin_unlock(&bitmap->lock); +} + +static inline int is_bitmap_empty(struct mlx4_bitmap *bitmap) +{ + int ret; + + mlx4_spin_lock(&bitmap->lock); + ret = (bitmap->avail == bitmap->max) ? 1 : 0; + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +static inline int is_bitmap_avail(struct mlx4_bitmap *bitmap) +{ + int ret; + + mlx4_spin_lock(&bitmap->lock); + ret = (bitmap->avail > 0) ? 
1 : 0; + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, uint32_t num, uint32_t mask) +{ + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = bitmap->avail = num; + bitmap->mask = mask; + bitmap->avail = bitmap->max; + mlx4_spinlock_init(&bitmap->lock, !mlx4_single_threaded); + bitmap->table = malloc(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t)); + + if (!bitmap->table) + return -ENOMEM; + memset((void *)bitmap->table, 0, + (int)(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t))); + return 0; +} + +inline void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) +{ + if (bitmap->table) + free(bitmap->table); +} + +static inline void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, uint32_t obj) +{ + mlx4_bitmap_free_range(bitmap, obj, 1); +} + +#endif Index: contrib/ofed/libmlx4/src/buf.c =================================================================== --- contrib/ofed/libmlx4/src/buf.c +++ contrib/ofed/libmlx4/src/buf.c @@ -36,9 +36,21 @@ #include #include +#include #include +#include +#include +#include #include "mlx4.h" +#include "bitmap.h" + +struct mlx4_hugetlb_mem { + int shmid; + char *shmaddr; + struct mlx4_bitmap bitmap; + struct list_head list; +}; #if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE)) @@ -59,13 +71,154 @@ #endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */ +void mlx4_hugetlb_mem_free(struct mlx4_hugetlb_mem *hmem) +{ + mlx4_bitmap_cleanup(&hmem->bitmap); + + if (shmdt((const void *)hmem->shmaddr) != 0) { + if (mlx4_trace) + perror("Detach shm failure"); + } + free(hmem); +} +static void mlx4_free_buf_huge_ex(struct mlx4_context *mctx, + struct mlx4_buf *buf, + int do_fork) +{ + struct mlx4_hugetlb_mem *hmem; + + if (do_fork) + ibv_dofork_range(buf->buf, buf->length); + + if (buf->hmem == NULL) { + if (mlx4_trace) + perror("No hugetlb mem"); + return; + } + + hmem = (struct mlx4_hugetlb_mem *) buf->hmem; + mlx4_spin_lock(&mctx->hugetlb_lock); + mlx4_bitmap_free_range(&hmem->bitmap, buf->base, + buf->length/MLX4_Q_CHUNK_SIZE); + + if (is_bitmap_empty(&hmem->bitmap)) { + list_del(&hmem->list); + mlx4_hugetlb_mem_free(hmem); + } + mlx4_spin_unlock(&mctx->hugetlb_lock); +} + +void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf) +{ + mlx4_free_buf_huge_ex(mctx, buf, 1); +} + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 0 +#endif + +struct mlx4_hugetlb_mem *mxl4_hugetlb_mem_alloc(size_t size) +{ + struct mlx4_hugetlb_mem *hmem; + size_t shm_len; + + hmem = malloc(sizeof(*hmem)); + if (!hmem) + return NULL; + + shm_len = (size > MLX4_SHM_LENGTH) ? 
align(size, MLX4_SHM_LENGTH) : + MLX4_SHM_LENGTH; + hmem->shmid = shmget(IPC_PRIVATE, shm_len, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (hmem->shmid < 0) { + if (mlx4_trace) + perror("shmget"); + free(hmem); + return NULL; + } + + hmem->shmaddr = shmat(hmem->shmid, MLX4_SHM_ADDR, MLX4_SHMAT_FLAGS); + if (hmem->shmaddr == (char *)-1) { + if (mlx4_trace) + perror("Shared memory attach failure"); + shmctl(hmem->shmid, IPC_RMID, NULL); + free(hmem); + return NULL; + } + + if (mlx4_bitmap_init(&hmem->bitmap, shm_len/MLX4_Q_CHUNK_SIZE, + shm_len/MLX4_Q_CHUNK_SIZE - 1)) { + if (mlx4_trace) + perror("mlx4_bitmap_init"); + mlx4_hugetlb_mem_free(hmem); + return NULL; + } + + /* Marked to destroy when process detaches from shmget segment */ + shmctl(hmem->shmid, IPC_RMID, NULL); + + return hmem; +} + + +int mlx4_alloc_prefered_buf(struct mlx4_context *mctx, + struct mlx4_buf *buf, + size_t size, int page_size, + enum mlx4_alloc_type alloc_type, + const char *component) +{ + int ret = 1; + + buf->hmem = NULL; + /* Fallback mechanism is used below: + priority is: huge pages , contig pages, default allocation */ + if (alloc_type == MLX4_ALLOC_TYPE_HUGE || + alloc_type == MLX4_ALLOC_TYPE_PREFER_HUGE || + alloc_type == MLX4_ALLOC_TYPE_ALL) { + ret = mlx4_alloc_buf_huge(mctx, buf, + size, + page_size); + if (!ret) + return 0; + + /* Checking whether HUGE is forced */ + if (alloc_type == MLX4_ALLOC_TYPE_HUGE) + return -1; + if (mlx4_trace) + printf(PFX "Huge mode allocation has failed,fallback to %s mode\n", + MLX4_ALLOC_TYPE_ALL ? "contig" : "default"); + + } + + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG || + alloc_type == MLX4_ALLOC_TYPE_PREFER_CONTIG || + alloc_type == MLX4_ALLOC_TYPE_ALL) { + ret = mlx4_alloc_buf_contig(mctx, buf, + size, + page_size, + component, NULL); + if (!ret) + return 0; + + /* Checking whether CONTIG is forced */ + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG) + return -1; + if (mlx4_trace) + printf(PFX "Contig mode allocation has failed,fallback to default mode\n"); + } + + return mlx4_alloc_buf(buf, size, page_size); + +} + + int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) { int ret; buf->length = align(size, page_size); buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf->buf == MAP_FAILED) return errno; @@ -78,6 +231,271 @@ void mlx4_free_buf(struct mlx4_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - munmap(buf->buf, buf->length); + if (buf->length) { + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); + } +} + +/* This function computes log2(v) rounded up. +* We don't want to have a dependency to libm which exposes ceil & log2 APIs. +* Code was written based on public domain code: + URL: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog. 
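mxl4_hugetlb_mem_alloc() above follows the classic SysV shared-memory recipe for huge pages: shmget() with SHM_HUGETLB, shmat(), then an immediate IPC_RMID so the segment disappears once the last user detaches. A standalone sketch of just that recipe (sizes and error handling are illustrative; running it requires huge pages configured on the host):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 0                    /* same fallback the patch uses */
#endif

int main(void)
{
    size_t len = 2UL * 1024 * 1024;                   /* one 2 MB huge page */
    int id = shmget(IPC_PRIVATE, len,
                    SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
    if (id < 0) {
        perror("shmget");                 /* e.g. no hugepages configured */
        return 1;
    }

    void *addr = shmat(id, NULL, 0);
    shmctl(id, IPC_RMID, NULL);          /* segment destroyed on last detach */
    if (addr == (void *)-1) {
        perror("shmat");
        return 1;
    }

    /* ...queue buffers would be carved out of addr via the bitmap above... */
    shmdt(addr);
    return 0;
}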
+*/ +static uint32_t mlx4_get_block_order(uint32_t v) +{ + static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + static const uint32_t shift_arr[] = {1, 2, 4, 8, 16}; + int i; + uint32_t input_val = v; + + register uint32_t r = 0;/* result of log2(v) will go here */ + for (i = 4; i >= 0; i--) { + + if (v & bits_arr[i]) { + v >>= shift_arr[i]; + r |= shift_arr[i]; + } + } + /* Rounding up if required */ + r += !!(input_val & ((1 << r) - 1)); + + return r; +} + + +static int mlx4_finalize_contiguous_alloc(struct mlx4_buf *buf, + void *addr, + size_t length) +{ + if (ibv_dontfork_range(addr, length)) { + munmap(addr, length); + return 1; + } + + /* We hook addr & length also internally for further + use via dreg_mr. On ibv_mr returned to user length or address may + be different than the allocated length or address as of alignment + issues. + */ + buf->buf = addr; + buf->length = length; + return 0; + +} + + +void mlx4_get_alloc_type(struct ibv_context *context, const char *component, + enum mlx4_alloc_type *alloc_type, + enum mlx4_alloc_type default_alloc_type) + +{ + char env_value[VERBS_MAX_ENV_VAL]; + char name_buff[128]; + + sprintf(name_buff, "%s_ALLOC_TYPE", component); + + /* First set defaults */ + *alloc_type = default_alloc_type; + + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + if (!strcasecmp(env_value, "ANON")) + *alloc_type = MLX4_ALLOC_TYPE_ANON; + else if (!strcasecmp(env_value, "HUGE")) + *alloc_type = MLX4_ALLOC_TYPE_HUGE; + else if (!strcasecmp(env_value, "CONTIG")) + *alloc_type = MLX4_ALLOC_TYPE_CONTIG; + else if (!strcasecmp(env_value, "PREFER_CONTIG")) + *alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + else if (!strcasecmp(env_value, "PREFER_HUGE")) + *alloc_type = MLX4_ALLOC_TYPE_PREFER_HUGE; + else if (!strcasecmp(env_value, "ALL")) + *alloc_type = MLX4_ALLOC_TYPE_ALL; + } + + return; +} + + +static void mlx4_alloc_get_env_info(struct ibv_context *context, + int *max_log2_contig_block_size, + int *min_log2_contig_block_size, + const char *component) + +{ + char env_value[VERBS_MAX_ENV_VAL]; + int value; + char name_buff[128]; + + /* First set defaults */ + *max_log2_contig_block_size = MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE; + *min_log2_contig_block_size = MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE; + + sprintf(name_buff, "%s_MAX_LOG2_CONTIG_BSIZE", component); + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + value = atoi(env_value); + if (value <= MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE && + value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE) + *max_log2_contig_block_size = value; + else + fprintf(stderr, + "Invalid value %d for %s\n", + value, name_buff); + } + sprintf(name_buff, "%s_MIN_LOG2_CONTIG_BSIZE", component); + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + value = atoi(env_value); + if (value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE && + value <= *max_log2_contig_block_size) + *min_log2_contig_block_size = value; + else + fprintf(stderr, + "Invalid value %d for %s\n", + value, name_buff); + } + return; } + + + +int mlx4_alloc_buf_contig(struct mlx4_context *mctx, + struct mlx4_buf *buf, size_t size, + int page_size, + const char *component, void *req_addr) +{ + void *addr = NULL; + int block_size_exp; + int max_log2_contig_block_size; + int min_log2_contig_block_size; + int mmap_flags = MAP_SHARED; + void *act_addr = NULL; + size_t act_size = size; + + struct ibv_context *context = &(mctx->ibv_ctx); + + mlx4_alloc_get_env_info(&mctx->ibv_ctx, + &max_log2_contig_block_size, + 
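mlx4_get_block_order() above is the Stanford bit-hacks integer log2, plus a round-up term for sizes that are not a power of two. A small self-contained restatement with two sanity checks (the asserted values are ordinary arithmetic, not driver output):

#include <assert.h>
#include <stdint.h>

static uint32_t log2_roundup(uint32_t v)      /* same idea as mlx4_get_block_order() */
{
    uint32_t in = v, r = 0;
    int s;

    for (s = 16; s >= 1; s >>= 1) {
        if (v >> s) {
            v >>= s;
            r |= (uint32_t)s;
        }
    }
    r += !!(in & ((1u << r) - 1));        /* round up when v was not a power of 2 */
    return r;
}

int main(void)
{
    assert(log2_roundup(4096) == 12);     /* exact power of two */
    assert(log2_roundup(4097) == 13);     /* anything larger rounds upward */
    return 0;
}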
&min_log2_contig_block_size, + component); + + /* Checking that we don't pass max block size */ + if (size >= (1 << max_log2_contig_block_size)) + block_size_exp = max_log2_contig_block_size; + else + block_size_exp = mlx4_get_block_order(size); + + if (req_addr) { + act_addr = (void *)((uintptr_t)req_addr & ~((uintptr_t)page_size - 1)); + act_size += (size_t)((uintptr_t)req_addr - (uintptr_t)act_addr); + mmap_flags |= MAP_FIXED; + } + + do { + /* The second parameter holds the total required length for + this contiguous allocation aligned to page size. + When calling mmap the last offset parameter + should be a multiple of the page size and holds: + 1) Indication that we are in that mode of + allocation contiguous memory (value #2) + 2) The required size of each block. + To enable future actions on mmap we + use the last 3 bits of the offset parameter + as the command identifier. + */ + addr = mmap(act_addr, act_size, + PROT_WRITE | PROT_READ, mmap_flags, + context->cmd_fd, + page_size * + (MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD + + (block_size_exp << MLX4_MMAP_CMD_BITS))); + + /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/ + if (addr != MAP_FAILED) + break; + + /* We failed - set addr to NULL and checks whether + a retry is relevant. + * If kernel doesn't support this command as of + compatibility issues we'll also get EINVAL. + */ + addr = NULL; + if (errno == EINVAL) + break; + + /* Retring asking for less contiguous pages per block */ + block_size_exp -= 1; + } while (block_size_exp >= min_log2_contig_block_size); + + if (!addr) + return 1; + + /* All was ok we'll make final steps to have this addr ready*/ + return mlx4_finalize_contiguous_alloc(buf, addr, act_size); +} + +int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size) +{ + struct mlx4_hugetlb_mem *hmem, *tmp_hmem; + int found = 0; + int ret = 0; + LIST_HEAD(slist); + + buf->length = align(size, MLX4_Q_CHUNK_SIZE); + + mlx4_spin_lock(&mctx->hugetlb_lock); + list_for_each_entry_safe(hmem, tmp_hmem, &mctx->hugetlb_list, list) { + if (is_bitmap_avail(&hmem->bitmap)) { + buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap, + buf->length/MLX4_Q_CHUNK_SIZE, 1); + if (buf->base == -1) + continue; + else { + buf->hmem = (void *)hmem; + found = 1; + break; + } + } + } + mlx4_spin_unlock(&mctx->hugetlb_lock); + + if (!found) { + int avail; + + hmem = mxl4_hugetlb_mem_alloc(buf->length); + if (hmem == NULL) + return -1; + + buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap, + buf->length/MLX4_Q_CHUNK_SIZE, 1); + if (buf->base == -1) { + if (mlx4_trace) + perror("mlx4_bitmap_alloc_range"); + mlx4_hugetlb_mem_free(hmem); + return -1; + } + + buf->hmem = (void *)hmem; + + avail = is_bitmap_avail(&hmem->bitmap); + mlx4_spin_lock(&mctx->hugetlb_lock); + if (avail) + list_add(&hmem->list, &mctx->hugetlb_list); + else + list_add_tail(&hmem->list, &mctx->hugetlb_list); + mlx4_spin_unlock(&mctx->hugetlb_lock); + } + + buf->buf = hmem->shmaddr + (buf->base * MLX4_Q_CHUNK_SIZE); + + ret = ibv_dontfork_range(buf->buf, buf->length); + if (ret) { + mlx4_free_buf_huge_ex(mctx, buf, 0); + buf->hmem = NULL; + if (mlx4_trace) + perror("ibv_dontfork_range"); + } + + return ret; +} + Index: contrib/ofed/libmlx4/src/cq.c =================================================================== --- contrib/ofed/libmlx4/src/cq.c +++ contrib/ofed/libmlx4/src/cq.c @@ -47,6 +47,8 @@ #include "mlx4.h" #include "doorbell.h" +int mlx4_stall_num_loop = 300; + enum { MLX4_CQ_DOORBELL = 0x20 }; @@ -61,8 +63,18 @@ 
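The mmap() call in mlx4_alloc_buf_contig() above passes a command id and the per-block size exponent to the kernel through the page-aligned offset argument. A toy illustration of that packing (the CMD_BITS and command values below are placeholders for this example, not the real mlx4.h constants):

#include <stdio.h>
#include <sys/types.h>

#define DEMO_MMAP_CMD_BITS              3   /* placeholder: low bits carry the command id */
#define DEMO_GET_CONTIGUOUS_PAGES_CMD   2   /* placeholder command number */

int main(void)
{
    long page_size = 4096;
    int  block_size_exp = 21;                         /* request 2 MB blocks */

    /* multiplying by page_size keeps the offset page aligned, as mmap requires */
    off_t offset = (off_t)page_size *
        (DEMO_GET_CONTIGUOUS_PAGES_CMD + (block_size_exp << DEMO_MMAP_CMD_BITS));

    printf("offset passed to mmap: %lld\n", (long long)offset);
    return 0;
}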
#define MLX4_CQ_DB_REQ_NOT (2 << 24) enum { + MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, + MLX4_CQE_L2_TUNNEL_L4_CSUM = 1 << 26, + MLX4_CQE_L2_TUNNEL = 1 << 27, + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { MLX4_CQE_OWNER_MASK = 0x80, MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_INL_SCATTER_MASK = 0x20, MLX4_CQE_OPCODE_MASK = 0x1f }; @@ -82,23 +94,50 @@ MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, }; +enum { + MLX4_CQE_STATUS_L4_CSUM = 1 << 2, + MLX4_CQE_STATUS_IPV4 = 1 << 6, + MLX4_CQE_STATUS_IPV4F = 1 << 7, + MLX4_CQE_STATUS_IPV6 = 1 << 8, + MLX4_CQE_STATUS_IPV4OPT = 1 << 9, + MLX4_CQE_STATUS_TCP = 1 << 10, + MLX4_CQE_STATUS_UDP = 1 << 11, + MLX4_CQE_STATUS_IPOK = 1 << 12 +}; + + struct mlx4_cqe { - uint32_t my_qpn; + uint32_t vlan_my_qpn; uint32_t immed_rss_invalid; uint32_t g_mlpath_rqpn; - uint8_t sl; - uint8_t reserved1; - uint16_t rlid; - uint32_t reserved2; + union { + struct { + union { + struct { + uint16_t sl_vid; + uint16_t rlid; + }; + uint32_t timestamp_16_47; + }; + uint16_t status; + uint8_t reserved2; + uint8_t badfcs_enc; + }; + struct { + uint16_t reserved4; + uint8_t smac[6]; + }; + }; uint32_t byte_cnt; uint16_t wqe_index; uint16_t checksum; - uint8_t reserved3[3]; + uint8_t reserved5[1]; + uint16_t timestamp_0_15; uint8_t owner_sr_opcode; -}; +} __attribute__((packed)); struct mlx4_err_cqe { - uint32_t my_qpn; + uint32_t vlan_my_qpn; uint32_t reserved1[5]; uint16_t wqe_index; uint8_t vendor_err; @@ -118,7 +157,7 @@ struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ - !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : tcqe; + !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe; } static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) @@ -126,18 +165,13 @@ return get_sw_cqe(cq, cq->cons_index); } -static void update_cons_index(struct mlx4_cq *cq) -{ - *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); -} - static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc) { if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) printf(PFX "local QP operation err " "(QPN %06x, WQE index %x, vendor syndrome %02x, " "opcode = %02x)\n", - htonl(cqe->my_qpn), htonl(cqe->wqe_index), + htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index), cqe->vendor_err, cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); @@ -191,22 +225,34 @@ static int mlx4_poll_one(struct mlx4_cq *cq, struct mlx4_qp **cur_qp, - struct ibv_wc *wc) + struct ibv_exp_wc *wc, + uint32_t wc_size, int is_exp) { struct mlx4_wq *wq; struct mlx4_cqe *cqe; - struct mlx4_srq *srq = NULL; + struct mlx4_srq *srq; uint32_t qpn; - uint32_t srqn; uint32_t g_mlpath_rqpn; uint16_t wqe_index; int is_error; int is_send; - + int size; + int left; + int list_len; + int i; + struct mlx4_inlr_rbuff *rbuffs; + uint8_t *sbuff; + int timestamp_en = !!(cq->creation_flags & + IBV_EXP_CQ_TIMESTAMP); + uint64_t exp_wc_flags = 0; + uint64_t wc_flags = 0; cqe = next_cqe_sw(cq); if (!cqe) return CQ_EMPTY; + if (cq->cqe_size == 64) + ++cqe; + ++cq->cons_index; VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); @@ -217,36 +263,44 @@ */ rmb(); - qpn = ntohl(cqe->my_qpn); + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + wc->qp_num = qpn; is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + + /* include checksum as work around for calc opcode */ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == - MLX4_CQE_OPCODE_ERROR; + MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff); - if (qpn & MLX4_XRC_QPN_BIT && !is_send) { - srqn 
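The get_sw_cqe() hunk above keeps the usual ownership test: a CQE belongs to software when its owner bit matches the parity of how many times the consumer index has wrapped the CQ. A toy model of that check (CQ size and indices are made up):

#include <assert.h>
#include <stdint.h>

#define OWNER_MASK 0x80

/* cqe_cnt must be a power of two; cq->ibv_cq.cqe holds cqe_cnt - 1 */
static int cqe_is_sw(uint8_t owner_sr_opcode, uint32_t cons_index, uint32_t cqe_cnt)
{
    return !(!!(owner_sr_opcode & OWNER_MASK) ^ !!(cons_index & cqe_cnt));
}

int main(void)
{
    /* first pass through a 256-entry CQ: a clear owner bit means "software's" */
    assert(cqe_is_sw(0x00, 5, 256));
    /* after one wrap the same clear bit means the CQE still belongs to hardware */
    assert(!cqe_is_sw(0x00, 256 + 5, 256));
    return 0;
}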
= ntohl(cqe->g_mlpath_rqpn) & 0xffffff; + if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) { /* - * We do not have to take the XRC SRQ table lock here, - * because CQs will be locked while XRC SRQs are removed + * We do not have to take the XSRQ table lock here, + * because CQs will be locked while SRQs are removed * from the table. */ - srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn); + *cur_qp = NULL; + srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table, + ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); if (!srq) return CQ_POLL_ERR; - } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) { - /* - * We do not have to take the QP table lock here, - * because CQs will be locked while QPs are removed - * from the table. - */ - *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), - qpn & 0xffffff); - if (!*cur_qp) - return CQ_POLL_ERR; + } else { + if (unlikely(!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num))) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!*cur_qp)) + return CQ_POLL_ERR; + } + if (is_exp) { + wc->qp = &((*cur_qp)->verbs_qp.qp); + exp_wc_flags |= IBV_EXP_WC_QP; + } + srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL; } - wc->qp_num = qpn & 0xffffff; - if (is_send) { wq = &(*cur_qp)->sq; wqe_index = ntohs(cqe->wqe_index); @@ -257,112 +311,267 @@ wqe_index = htons(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_index]; mlx4_free_srq_wqe(srq, wqe_index); - } else if ((*cur_qp)->ibv_qp.srq) { - srq = to_msrq((*cur_qp)->ibv_qp.srq); - wqe_index = htons(cqe->wqe_index); - wc->wr_id = srq->wrid[wqe_index]; - mlx4_free_srq_wqe(srq, wqe_index); + if (is_exp) { + wc->srq = &(srq->verbs_srq.srq); + exp_wc_flags |= IBV_EXP_WC_SRQ; + } } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wqe_index = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[wqe_index]; ++wq->tail; } - if (is_error) { - mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + if (unlikely(is_error)) { + mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe, + (struct ibv_wc *)wc); return CQ_OK; } wc->status = IBV_WC_SUCCESS; + if (timestamp_en && offsetof(struct ibv_exp_wc, timestamp) < wc_size) { + /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is + * supported. 
CQ_CREATE_WITH_TIMESTAMPING_SYS isn't + * supported */ + if (cq->creation_flags & + IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME) + wc->timestamp = 0; + else { + wc->timestamp = + (uint64_t)(ntohl(cqe->timestamp_16_47) + + !cqe->timestamp_0_15) << 16 + | (uint64_t)ntohs(cqe->timestamp_0_15); + exp_wc_flags |= IBV_EXP_WC_WITH_TIMESTAMP; + } + } + if (is_send) { - wc->wc_flags = 0; switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_CALC_RDMA_WRITE_IMM: case MLX4_OPCODE_RDMA_WRITE_IMM: - wc->wc_flags |= IBV_WC_WITH_IMM; + wc_flags |= IBV_WC_WITH_IMM; case MLX4_OPCODE_RDMA_WRITE: - wc->opcode = IBV_WC_RDMA_WRITE; + wc->exp_opcode = IBV_EXP_WC_RDMA_WRITE; break; case MLX4_OPCODE_SEND_IMM: - wc->wc_flags |= IBV_WC_WITH_IMM; + wc_flags |= IBV_WC_WITH_IMM; case MLX4_OPCODE_SEND: - wc->opcode = IBV_WC_SEND; + wc->exp_opcode = IBV_EXP_WC_SEND; break; case MLX4_OPCODE_RDMA_READ: - wc->opcode = IBV_WC_RDMA_READ; + wc->exp_opcode = IBV_EXP_WC_RDMA_READ; wc->byte_len = ntohl(cqe->byte_cnt); break; case MLX4_OPCODE_ATOMIC_CS: - wc->opcode = IBV_WC_COMP_SWAP; + wc->exp_opcode = IBV_EXP_WC_COMP_SWAP; wc->byte_len = 8; break; case MLX4_OPCODE_ATOMIC_FA: - wc->opcode = IBV_WC_FETCH_ADD; + wc->exp_opcode = IBV_EXP_WC_FETCH_ADD; wc->byte_len = 8; break; + case MLX4_OPCODE_ATOMIC_MASK_CS: + wc->exp_opcode = IBV_EXP_WC_MASKED_COMP_SWAP; + break; + case MLX4_OPCODE_ATOMIC_MASK_FA: + wc->exp_opcode = IBV_EXP_WC_MASKED_FETCH_ADD; + break; + case MLX4_OPCODE_LOCAL_INVAL: + if (unlikely(!is_exp)) + return CQ_POLL_ERR; + wc->exp_opcode = IBV_EXP_WC_LOCAL_INV; + break; + case MLX4_OPCODE_SEND_INVAL: + wc->exp_opcode = IBV_EXP_WC_SEND; + break; case MLX4_OPCODE_BIND_MW: - wc->opcode = IBV_WC_BIND_MW; + wc->exp_opcode = IBV_EXP_WC_BIND_MW; break; default: /* assume it's a send completion */ - wc->opcode = IBV_WC_SEND; + wc->exp_opcode = IBV_EXP_WC_SEND; break; } } else { wc->byte_len = ntohl(cqe->byte_cnt); + if ((*cur_qp) && (*cur_qp)->max_inlr_sg && + (cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) { + rbuffs = (*cur_qp)->inlr_buff.buff[wqe_index].sg_list; + list_len = (*cur_qp)->inlr_buff.buff[wqe_index].list_len; + sbuff = mlx4_get_recv_wqe((*cur_qp), wqe_index); + left = wc->byte_len; + for (i = 0; (i < list_len) && left; i++) { + size = min(rbuffs->rlen, left); + memcpy(rbuffs->rbuff, sbuff, size); + left -= size; + rbuffs++; + sbuff += size; + } + if (left) { + wc->status = IBV_WC_LOC_LEN_ERR; + return CQ_OK; + } + } switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: - wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; - wc->wc_flags = IBV_WC_WITH_IMM; + wc->exp_opcode = IBV_EXP_WC_RECV_RDMA_WITH_IMM; + wc_flags = IBV_WC_WITH_IMM; wc->imm_data = cqe->immed_rss_invalid; break; + case MLX4_RECV_OPCODE_SEND_INVAL: + if (unlikely(!is_exp)) + return CQ_POLL_ERR; + wc->exp_opcode = IBV_EXP_WC_RECV; + exp_wc_flags |= IBV_EXP_WC_WITH_INV; + wc->imm_data = ntohl(cqe->immed_rss_invalid); + break; case MLX4_RECV_OPCODE_SEND: - wc->opcode = IBV_WC_RECV; - wc->wc_flags = 0; + wc->exp_opcode = IBV_EXP_WC_RECV; + wc_flags = 0; break; case MLX4_RECV_OPCODE_SEND_IMM: - wc->opcode = IBV_WC_RECV; - wc->wc_flags = IBV_WC_WITH_IMM; + wc->exp_opcode = IBV_EXP_WC_RECV; + wc_flags = IBV_WC_WITH_IMM; wc->imm_data = cqe->immed_rss_invalid; break; } - wc->slid = ntohs(cqe->rlid); - wc->sl = cqe->sl >> 4; + if (!timestamp_en) { + exp_wc_flags |= IBV_EXP_WC_WITH_SLID; + wc->slid = ntohs(cqe->rlid); + } g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; 
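The timestamp path above reassembles a 48-bit hardware clock from two CQE fields, bumping the upper half when the low 16 bits read back as zero (the hwclock-wraparound workaround mentioned in the changelog). A tiny host-byte-order sketch of that arithmetic (the asserted values are invented):

#include <assert.h>
#include <stdint.h>

/* rebuild bits 0..47 of the clock; the driver applies ntohl/ntohs first */
static uint64_t rebuild_timestamp(uint32_t high32, uint16_t low16)
{
    return ((uint64_t)(high32 + !low16) << 16) | (uint64_t)low16;
}

int main(void)
{
    assert(rebuild_timestamp(0x00123456, 0xABCD) == 0x123456ABCDULL);
    /* when the low 16 bits are zero, the upper part is bumped by one */
    assert(rebuild_timestamp(0x00123456, 0x0000) == 0x1234570000ULL);
    return 0;
}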
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; - wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; + wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f; + /* When working with xrc srqs, don't have qp to check link layer. + * Using IB SL, should consider Roce. (TBD) + */ + /* sl is invalid when timestamp is used */ + if (!timestamp_en) { + if ((*cur_qp) && (*cur_qp)->link_layer == + IBV_LINK_LAYER_ETHERNET) + wc->sl = ntohs(cqe->sl_vid) >> 13; + else + wc->sl = ntohs(cqe->sl_vid) >> 12; + exp_wc_flags |= IBV_EXP_WC_WITH_SL; + } + if (is_exp) { + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)) { + /* Only ConnectX-3 Pro reports checksum for now) */ + exp_wc_flags |= + MLX4_TRANSPOSE(cqe->badfcs_enc, + MLX4_CQE_STATUS_L4_CSUM, + (uint64_t)IBV_EXP_WC_RX_TCP_UDP_CSUM_OK) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPOK), + (uint64_t)IBV_EXP_WC_RX_IP_CSUM_OK) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPV4), + (uint64_t)IBV_EXP_WC_RX_IPV4_PACKET) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPV6), + (uint64_t)IBV_EXP_WC_RX_IPV6_PACKET) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL), + (uint64_t)IBV_EXP_WC_RX_TUNNEL_PACKET) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_IPOK), + (uint64_t)IBV_EXP_WC_RX_OUTER_IP_CSUM_OK) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_L4_CSUM), + (uint64_t)IBV_EXP_WC_RX_OUTER_TCP_UDP_CSUM_OK) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_IPV4), + (uint64_t)IBV_EXP_WC_RX_OUTER_IPV4_PACKET); + exp_wc_flags |= + MLX4_TRANSPOSE(~exp_wc_flags, + IBV_EXP_WC_RX_OUTER_IPV4_PACKET, + IBV_EXP_WC_RX_OUTER_IPV6_PACKET); + } + } } + if (is_exp) + wc->exp_wc_flags = exp_wc_flags | (uint64_t)wc_flags; + + ((struct ibv_wc *)wc)->wc_flags = wc_flags; + return CQ_OK; } -int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +#if defined(__amd64__) || defined(__i386__) +static inline unsigned long get_cycles() +{ + unsigned low, high; + unsigned long long val; + asm volatile ("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val = (val << 32) | low; + return val; +} +#else +static inline unsigned long get_cycles() +{ + return 0; +} +#endif + +static void mlx4_stall_poll_cq() +{ + int i; + + for (i = 0; i < mlx4_stall_num_loop; i++) + (void)get_cycles(); +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_exp_wc *wc, + uint32_t wc_size, int is_exp) { struct mlx4_cq *cq = to_mcq(ibcq); struct mlx4_qp *qp = NULL; int npolled; int err = CQ_OK; - pthread_spin_lock(&cq->lock); - + if (unlikely(cq->stall_next_poll)) { + cq->stall_next_poll = 0; + mlx4_stall_poll_cq(); + } + mlx4_lock(&cq->lock); + for (npolled = 0; npolled < ne; ++npolled) { - err = mlx4_poll_one(cq, &qp, wc + npolled); - if (err != CQ_OK) + err = mlx4_poll_one(cq, &qp, ((void *)wc) + npolled * wc_size, + wc_size, is_exp); + if (unlikely(err != CQ_OK)) break; } - if (npolled) - update_cons_index(cq); + if (likely(npolled || err == CQ_POLL_ERR)) + mlx4_update_cons_index(cq); - pthread_spin_unlock(&cq->lock); + mlx4_unlock(&cq->lock); + if (unlikely(cq->stall_enable && err == CQ_EMPTY)) + cq->stall_next_poll = 1; + return err == CQ_POLL_ERR ? 
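The exp_wc_flags block above relocates individual hardware status bits into the flag positions libibverbs expects. A minimal stand-in for that bit transposition (the real MLX4_TRANSPOSE and mlx4_transpose_*() helpers live in mlx4.h and are not reproduced here; this function only mimics the idea):

#include <assert.h>
#include <stdint.h>

/* if the "from" bit is set in val, report the "to" bit, else nothing */
static uint64_t transpose_bit(uint64_t val, uint64_t from, uint64_t to)
{
    return (val & from) ? to : 0;
}

int main(void)
{
    uint32_t hw_status = 1u << 12;                  /* e.g. an "IP checksum OK" bit */
    uint64_t wc_flags  = transpose_bit(hw_status, 1u << 12, 1ull << 3);

    assert(wc_flags == (1ull << 3));
    assert(transpose_bit(0, 1u << 12, 1ull << 3) == 0);
    return 0;
}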
err : npolled; } +int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size) +{ + return mlx4_poll_cq(ibcq, num_entries, wc, wc_size, 1); +} + +int mlx4_poll_ibv_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + return mlx4_poll_cq(ibcq, ne, (struct ibv_exp_wc *)wc, sizeof(*wc), 0); +} + int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) { struct mlx4_cq *cq = to_mcq(ibvcq); @@ -402,12 +611,10 @@ uint32_t prod_index; uint8_t owner_bit; int nfreed = 0; - int is_xrc_srq = 0; int cqe_inc = cq->cqe_size == 64 ? 1 : 0; - if (srq && srq->ibv_srq.xrc_cq) - is_xrc_srq = 1; - + if (cq->last_qp && cq->last_qp->verbs_qp.qp.qp_num == qpn) + cq->last_qp = NULL; /* * First we need to find the current producer index, so we * know where to start cleaning from. It doesn't matter if HW @@ -426,12 +633,12 @@ while ((int) --prod_index - (int) cq->cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); cqe += cqe_inc; - if (is_xrc_srq && - (ntohl(cqe->g_mlpath_rqpn & 0xffffff) == srq->srqn) && + if (srq && srq->ext_srq && + ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) { mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); ++nfreed; - } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) { + } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); ++nfreed; @@ -452,22 +659,22 @@ * updating consumer index. */ wmb(); - update_cons_index(cq); + mlx4_update_cons_index(cq); } } void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) { - pthread_spin_lock(&cq->lock); + mlx4_lock(&cq->lock); __mlx4_cq_clean(cq, qpn, srq); - pthread_spin_unlock(&cq->lock); + mlx4_unlock(&cq->lock); } int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) { uint32_t i; - for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i) + for (i = cq->cons_index; get_sw_cqe(cq, i); ++i) ; return i - cq->cons_index; @@ -496,13 +703,491 @@ ++cq->cons_index; } -int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, +int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent, int entry_size) { - if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size), - dev->page_size)) + struct mlx4_device *dev = to_mdev(mctx->ibv_ctx.device); + int ret; + enum mlx4_alloc_type alloc_type; + enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + + if (mlx4_use_huge(&mctx->ibv_ctx, "HUGE_CQ")) + default_alloc_type = MLX4_ALLOC_TYPE_HUGE; + + mlx4_get_alloc_type(&mctx->ibv_ctx, MLX4_CQ_PREFIX, &alloc_type, + default_alloc_type); + + ret = mlx4_alloc_prefered_buf(mctx, buf, + align(nent * entry_size, dev->page_size), + dev->page_size, + alloc_type, + MLX4_CQ_PREFIX); + + if (ret) return -1; + memset(buf->buf, 0, nent * entry_size); return 0; } + +/* + * poll family functions + */ +static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) __attribute__((always_inline)); +static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) +{ + struct mlx4_srq *srq; + uint32_t qpn; + uint16_t wqe_index; + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + + + if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) { + if (unlikely(qpn & MLX4_XRC_QPN_BIT)) { + /* + * We do not have to take the XSRQ 
table lock here, + * because CQs will be locked while SRQs are removed + * from the table. + */ + cur_qp = NULL; + srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table, + ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); + if (!srq) + return CQ_POLL_ERR; + + /* Advance indexes only on success */ + wqe_index = htons(cqe->wqe_index); + mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index); + + ++cq->cons_index; + + return CQ_OK; + } + + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!cur_qp)) + return CQ_POLL_ERR; + cq->last_qp = cur_qp; + } + + if (!cur_qp->max_inlr_sg) { + /* Advance indexes only on success to enable getting + * the full CQE with ibv_poll_cq in case of failure + */ + if (unlikely(cur_qp->verbs_qp.qp.srq)) { + wqe_index = htons(cqe->wqe_index); + mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index); + } else { + ++cur_qp->rq.tail; + } + ++cq->cons_index; + + return CQ_OK; + } + + /* We get here only when cur_qp->max_inlr_sg != 0 */ + if (likely(cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) { + int size; + int left; + int list_len; + int i; + struct mlx4_inlr_rbuff *rbuffs; + uint8_t *sbuff; + int is_error; + + /* include checksum as work around for calc opcode */ + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff); + if (unlikely(is_error)) + return CQ_POLL_ERR; + + wqe_index = cur_qp->rq.tail & (cur_qp->rq.wqe_cnt - 1); + sbuff = mlx4_get_recv_wqe(cur_qp, wqe_index); + left = ntohl(cqe->byte_cnt); + if (likely(buf)) { + *inl = 1; + memcpy(buf, sbuff, left); + } else { + rbuffs = cur_qp->inlr_buff.buff[wqe_index].sg_list; + list_len = cur_qp->inlr_buff.buff[wqe_index].list_len; + for (i = 0; (i < list_len) && left; i++) { + size = min(rbuffs->rlen, left); + memcpy(rbuffs->rbuff, sbuff, size); + left -= size; + rbuffs++; + sbuff += size; + } + if (left) + return CQ_POLL_ERR; + } + } + + /* Advance indexes only on success to enable getting + * the full CQE with ibv_poll_cq in case of failure + */ + ++cur_qp->rq.tail; + + ++cq->cons_index; + + return CQ_OK; +} + +static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp) __attribute__((always_inline)); +static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp) +{ + uint32_t qpn; + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
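drain_rx() above copies an inline-received payload out of the WQE buffer into the application's scatter list, stopping when either side runs out. A standalone sketch of that copy loop (buffer names and sizes are invented for the example):

#include <assert.h>
#include <string.h>
#include <stdint.h>

struct rbuf { void *addr; size_t len; };

/* spread one contiguous payload across a scatter list; returns bytes left over */
static size_t scatter_copy(const uint8_t *src, size_t len,
                           struct rbuf *sg, int sg_num)
{
    size_t left = len;
    for (int i = 0; i < sg_num && left; i++) {
        size_t chunk = sg[i].len < left ? sg[i].len : left;
        memcpy(sg[i].addr, src, chunk);
        src += chunk;
        left -= chunk;
    }
    return left;                 /* non-zero means the scatter list was too small */
}

int main(void)
{
    uint8_t payload[10] = "abcdefghi";
    uint8_t a[4], b[8];
    struct rbuf sg[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

    assert(scatter_copy(payload, sizeof(payload), sg, 2) == 0);
    assert(memcmp(a, "abcd", 4) == 0 && memcmp(b, "efghi", 6) == 0);
    return 0;
}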
+ */ + cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!cur_qp)) + return CQ_POLL_ERR; + cq->last_qp = cur_qp; + } + + /* Advance indexes only on success */ + cur_qp->sq.tail += (uint16_t)(ntohs(cqe->wqe_index) - (uint16_t)cur_qp->sq.tail); + ++cq->cons_index; + + return CQ_OK; +} + +static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) __attribute__((always_inline)); +static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) +{ + int cqe_off = (cqe_size & 64) >> 1; /* CQE offset is 32 bytes in case cqe_size == 64 */ + struct mlx4_cqe *cqe = cq->buf.buf + (cq->cons_index & cq->ibv_cq.cqe) * cqe_size + cqe_off; + + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & (cq->ibv_cq.cqe + 1))) + return NULL; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + return cqe; +} + +static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) __attribute__((always_inline)); +static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_cqe *cqe; + int npolled; + int err = CQ_OK; + + if (unlikely(use_lock)) + mlx4_lock(&cq->lock); + + for (npolled = 0; npolled < max_entries; ++npolled) { + cqe = get_next_cqe(cq, cqe_size); + if (!cqe) { + err = CQ_EMPTY; + break; + } + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + if (likely(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + err = update_sq_tail(cq, cqe, cq->last_qp); + else + err = drain_rx(cq, cqe, cq->last_qp, NULL, NULL); + + if (unlikely(err != CQ_OK)) + break; + } + + if (likely(npolled)) { + mlx4_update_cons_index(cq); + err = CQ_OK; + } + + if (unlikely(use_lock)) + mlx4_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? -1 : npolled; +} + +static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) __attribute__((always_inline)); +static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) +{ + /* Only ConnectX-3 Pro reports checksum for now) */ + if (likely(cur_qp && (cur_qp->qp_cap_cache & + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP))) { + int32_t flags; + int32_t tmp; + + /* + * The relevant bits are in different locations on their + * CQE fields therefore we can join them in one 32bit + * variable. 
+ */ + tmp = (cqe->badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) | + (ntohs(cqe->status) & (MLX4_CQE_STATUS_IPOK | + MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV6)) | + (ntohl(cqe->vlan_my_qpn) & (MLX4_CQE_L2_TUNNEL | + MLX4_CQE_L2_TUNNEL_IPOK | + MLX4_CQE_L2_TUNNEL_L4_CSUM | + MLX4_CQE_L2_TUNNEL_IPV4)); + if (likely(tmp == cur_qp->cached_rx_csum_flags)) { + flags = cur_qp->transposed_rx_csum_flags; + } else { + flags = mlx4_transpose(tmp, MLX4_CQE_STATUS_IPOK, IBV_EXP_CQ_RX_IP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_L4_CSUM, IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV4, IBV_EXP_CQ_RX_IPV4_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV6, IBV_EXP_CQ_RX_IPV6_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL, IBV_EXP_CQ_RX_TUNNEL_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPOK, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_L4_CSUM, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET) | + mlx4_transpose(~tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV6_PACKET); + cur_qp->cached_rx_csum_flags = tmp; + cur_qp->transposed_rx_csum_flags = flags; + } + + return flags; + } + + return 0; +} + +static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl, + const int use_lock, const int cqe_size, + uint32_t *flags) __attribute__((always_inline)); +static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl, + const int use_lock, const int cqe_size, + uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_cqe *cqe; + int32_t size = 0; + int err; + + if (unlikely(use_lock)) + mlx4_lock(&cq->lock); + + cqe = get_next_cqe(cq, cqe_size); + if (cqe) { + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + if (likely(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))) { + err = drain_rx(cq, cqe, cq->last_qp, buf, inl); + if (likely(err == CQ_OK)) { + size = ntohl(cqe->byte_cnt); + if (flags) + *flags = get_flags(cq->last_qp, cqe); + mlx4_update_cons_index(cq); + } + } else { + err = CQ_POLL_ERR; + } + + } else { + err = CQ_EMPTY; + } + + + if (unlikely(use_lock)) + mlx4_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? 
-1 : size; +} + +int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_cnt(ibcq, max, 1, cq->cqe_size); +} + +int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_cnt(ibcq, max, 0, cq->cqe_size); +} + +int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 32); +} + +int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 64); +} + +int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 128); +} + +int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 1, cq->cqe_size, NULL); +} + +int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 0, cq->cqe_size, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 32, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 64, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 128, NULL); +} + +int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 1, cq->cqe_size, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 0, cq->cqe_size, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 32, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; 
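/*
 * Editorial aside, not part of the patch: the block of one-line wrappers
 * here is a constant-propagation pattern.  poll_cnt()/poll_length() are
 * forced inline with 'use_lock' and 'cqe_size' passed as literal constants,
 * so every exported variant is compiled with the locking branches and the
 * CQE-size arithmetic folded away.  A generic sketch of the technique,
 * using hypothetical names:
 */
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static inline int sum_n(const int *data, int n, const int use_lock,
			const int stride) __attribute__((always_inline));
static inline int sum_n(const int *data, int n, const int use_lock,
			const int stride)
{
	int i, sum = 0;

	if (use_lock)			/* folds away when use_lock is 0 */
		pthread_mutex_lock(&demo_lock);
	for (i = 0; i < n; i++)		/* data must hold n * stride ints */
		sum += data[i * stride];
	if (use_lock)
		pthread_mutex_unlock(&demo_lock);

	return sum;
}

/* Thin exported specializations, analogous to the *_safe/*_unsafe_cqeNN set */
int sum_n_safe(const int *data, int n)		 { return sum_n(data, n, 1, 1); }
int sum_n_unsafe(const int *data, int n)	 { return sum_n(data, n, 0, 1); }
int sum_n_unsafe_stride4(const int *data, int n) { return sum_n(data, n, 0, 4); }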
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 64, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 128, flags); +} + +static struct ibv_exp_cq_family mlx4_poll_cq_family_safe = { + .poll_cnt = mlx4_poll_cnt_safe, + .poll_length = mlx4_poll_length_safe, + .poll_length_flags = mlx4_poll_length_flags_safe +}; + +enum mlx4_poll_cq_cqe_sizes { + MLX4_POLL_CQ_CQE_32 = 0, + MLX4_POLL_CQ_CQE_64 = 1, + MLX4_POLL_CQ_CQE_128 = 2, + MLX4_POLL_CQ_CQE_OTHER = 3, + MLX4_POLL_CQ_NUM_CQE_SIZES = 4, +}; + +static struct ibv_exp_cq_family mlx4_poll_cq_family_unsafe_tbl[MLX4_POLL_CQ_NUM_CQE_SIZES] = { + [MLX4_POLL_CQ_CQE_32] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe32, + .poll_length = mlx4_poll_length_unsafe_cqe32, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe32 + }, + [MLX4_POLL_CQ_CQE_64] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe64, + .poll_length = mlx4_poll_length_unsafe_cqe64, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe64 + }, + [MLX4_POLL_CQ_CQE_128] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe128, + .poll_length = mlx4_poll_length_unsafe_cqe128, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe128 + }, + [MLX4_POLL_CQ_CQE_OTHER] = { + .poll_cnt = mlx4_poll_cnt_unsafe_other, + .poll_length = mlx4_poll_length_unsafe_other, + .poll_length_flags = mlx4_poll_length_flags_unsafe_other + }, +}; + +struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status) +{ + enum mlx4_poll_cq_cqe_sizes cqe_size = MLX4_POLL_CQ_CQE_OTHER; + + if (params->flags) { + fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for CQ family\n", params->flags); + *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED; + + return NULL; + } + if (params->family_flags) { + fprintf(stderr, PFX "Family flags(0x%x) are not supported for CQ family\n", params->family_flags); + *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED; + + return NULL; + } + + if (cq->model_flags & MLX4_CQ_MODEL_FLAG_THREAD_SAFE) + return &mlx4_poll_cq_family_safe; + + if (cq->cqe_size == 32) + cqe_size = MLX4_POLL_CQ_CQE_32; + else if (cq->cqe_size == 64) + cqe_size = MLX4_POLL_CQ_CQE_64; + else if (cq->cqe_size == 128) + cqe_size = MLX4_POLL_CQ_CQE_128; + + return &mlx4_poll_cq_family_unsafe_tbl[cqe_size]; +} Index: contrib/ofed/libmlx4/src/doorbell.h =================================================================== --- contrib/ofed/libmlx4/src/doorbell.h +++ contrib/ofed/libmlx4/src/doorbell.h @@ -33,7 +33,8 @@ #ifndef DOORBELL_H #define DOORBELL_H -#ifdef __LP64__ +#if __LP64__ + #if __BYTE_ORDER == __LITTLE_ENDIAN # define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) #elif __BYTE_ORDER == __BIG_ENDIAN @@ -51,10 +52,10 @@ static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) { - pthread_spin_lock(&ctx->uar_lock); + mlx4_spin_lock(&ctx->uar_lock); *(volatile uint32_t *) (ctx->uar + offset) = val[0]; *(volatile uint32_t *) (ctx->uar + offset + 4) = val[1]; - pthread_spin_unlock(&ctx->uar_lock); + mlx4_spin_unlock(&ctx->uar_lock); } #endif Index: contrib/ofed/libmlx4/src/list.h =================================================================== --- 
/dev/null +++ contrib/ofed/libmlx4/src/list.h @@ -0,0 +1,330 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is + * empty _and_ checks that no other CPU might be + * in the process of still modifying either member + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + * + * @head: the list to test. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member)*__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; prefetch(pos->next), pos != (head); \ + pos->next) + +/** + * __list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + * + * This variant differs from list_for_each() in that it's the + * simplest possible list iteration code, no prefetching is done. + * Use this for code that knows the list to be very short (empty + * or 1 entry) most of the time. + */ +#define __list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * list_prepare_entry - prepare a pos entry for use as a start point in + * list_for_each_entry_continue + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_struct within the struct. + */ +#define list_prepare_entry(pos, head, member) \ + ((pos) ? : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
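/*
 * Editorial aside, not part of the patch: a minimal usage sketch for the
 * list.h API added above - embed a struct list_head in your object, link
 * objects through it, and recover the container with list_entry().  Assumes
 * the header above is reachable as "list.h":
 */
#include <stdio.h>
#include <stdlib.h>
#include "list.h"

struct item {
	int			value;
	struct list_head	node;	/* linkage embedded in the object */
};

static LIST_HEAD(items);

static void list_demo(void)
{
	struct list_head *pos, *n;
	struct item *it;
	int i;

	for (i = 0; i < 3; i++) {
		it = malloc(sizeof(*it));
		it->value = i;
		list_add_tail(&it->node, &items);	/* FIFO order */
	}

	/* _safe variant: the current entry may be removed while iterating */
	list_for_each_safe(pos, n, &items) {
		it = list_entry(pos, struct item, node);
		printf("%d\n", it->value);
		list_del(pos);
		free(it);
	}
}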
+ */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#endif + Index: contrib/ofed/libmlx4/src/mlx4-abi.h =================================================================== --- contrib/ofed/libmlx4/src/mlx4-abi.h +++ contrib/ofed/libmlx4/src/mlx4-abi.h @@ -35,14 +35,22 @@ #include -#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MIN_ABI_VERSION 3 #define MLX4_UVERBS_MAX_ABI_VERSION 4 +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0, +#ifdef MLX4_WQE_FORMAT + MLX4_USER_DEV_CAP_WQE_FORMAT = 1L << 1 +#endif +}; + struct mlx4_alloc_ucontext_resp_v3 { struct ibv_get_context_resp ibv_resp; __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; + __u32 cqe_size; }; struct mlx4_alloc_ucontext_resp { @@ -54,6 +62,14 @@ __u32 cqe_size; }; +struct mlx4_alloc_ucontext_req { + struct ibv_get_context cmd; +#ifdef MLX4_WQE_FORMAT + __u32 lib_caps; + __u32 reserved; +#endif +}; + struct mlx4_alloc_pd_resp { struct ibv_alloc_pd_resp ibv_resp; __u32 pdn; @@ -77,16 +93,14 @@ __u64 buf_addr; }; -#ifdef HAVE_IBV_XRC_OPS -struct mlx4_create_xrc_srq { - struct ibv_create_xrc_srq ibv_cmd; +struct mlx4_create_srq { + struct ibv_create_srq ibv_cmd; __u64 buf_addr; __u64 db_addr; }; -#endif -struct mlx4_create_srq { - struct ibv_create_srq ibv_cmd; +struct mlx4_create_xsrq { + struct ibv_create_xsrq ibv_cmd; __u64 buf_addr; __u64 db_addr; }; @@ -97,8 +111,7 @@ __u32 reserved; }; -struct mlx4_create_qp { - struct ibv_create_qp ibv_cmd; +struct mlx4_create_qp_base { __u64 buf_addr; __u64 db_addr; __u8 log_sq_bb_count; @@ -107,12 +120,14 @@ __u8 reserved[5]; }; -#ifdef HAVE_IBV_XRC_OPS -struct mlx4_open_xrc_domain_resp { - struct ibv_open_xrc_domain_resp ibv_resp; - __u32 xrcdn; - __u32 reserved; +struct mlx4_exp_create_qp_provider { + struct mlx4_create_qp_base base; + __u64 uar_virt_add; +}; + +struct mlx4_create_qp { + struct ibv_create_qp ibv_cmd; + struct mlx4_create_qp_base base; }; -#endif #endif /* MLX4_ABI_H */ Index: contrib/ofed/libmlx4/src/mlx4.h =================================================================== --- contrib/ofed/libmlx4/src/mlx4.h +++ contrib/ofed/libmlx4/src/mlx4.h @@ -34,10 +34,32 @@ #ifndef MLX4_H #define MLX4_H +#include #include +#include +#include #include +#include #include +#include +#include + +#define MLX4_MMAP_CMD_BITS 8 +#define MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD 2 +#define MLX4_IB_MMAP_GET_HW_CLOCK 3 + +/* Use EXP mmap commands until it is pushed to upstream */ +#define MLX4_IB_EXP_MMAP_EXT_UAR_PAGE 0xFE +#define MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE 0xFF + +#define MLX4_IB_MMAP_CMD_MASK 0xFF +#define MLX4_CQ_PREFIX "MLX_CQ" +#define MLX4_QP_PREFIX "MLX_QP" +#define MLX4_MR_PREFIX "MLX_MR" +#define MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE 23 +#define MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE 12 +#define MLX4_PORTS_NUM 2 #ifdef HAVE_VALGRIND_MEMCHECK_H @@ -69,7 +91,7 @@ #if defined(__i386__) #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") -#elif defined(__x86_64__) +#elif defined(__amd64__) #define wc_wmb() asm volatile("sfence" ::: "memory") #elif defined(__ia64__) #define wc_wmb() asm volatile("fwb" ::: "memory") @@ -79,29 +101,93 @@ #endif -#ifndef HAVE_IBV_MORE_OPS -#undef HAVE_IBV_XRC_OPS -#undef HAVE_IBV_CREATE_QP_EXP -#endif - #define HIDDEN __attribute__((visibility ("hidden"))) +#define MLX4_GCC_VERSION (__GNUC__ * 100 + 
__GNUC_MINOR__) + +#if MLX4_GCC_VERSION >= 403 +# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64))) +# define __MLX4_ALGN_DATA__ __attribute__((aligned(64))) +#else +# define __MLX4_ALGN_FUNC__ +# define __MLX4_ALGN_DATA__ +#endif + #define PFX "mlx4: " #ifndef max -#define max(a,b) \ +#define max(a, b) \ ({ typeof (a) _a = (a); \ typeof (b) _b = (b); \ _a > _b ? _a : _b; }) #endif #ifndef min -#define min(a,b) \ +#define min(a, b) \ ({ typeof (a) _a = (a); \ typeof (b) _b = (b); \ _a < _b ? _a : _b; }) #endif +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x),1) +#else +#define likely(x) (x) +#endif +#endif + + +#ifndef unlikely +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + +#ifndef uninitialized_var +#define uninitialized_var(x) x = x +#endif + +#include "list.h" + +/****************************************/ +/* ioctl codes */ +/****************************************/ +#define MLX4_IOC_MAGIC 'm' +#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int) + +/* Generic macro to convert MLX4 to IBV flags. */ +#define MLX4_TRANSPOSE(val, from, to) \ + (((from) >= (to)) ? \ + (((val) & (from)) / ((from) / (to))) : \ + (((val) & (from)) * ((to) / (from)))) + +static inline uint64_t mlx4_transpose_uint16_t(uint16_t val, uint16_t from, uint64_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +static inline uint64_t mlx4_transpose_uint32_t(uint32_t val, uint32_t from, uint64_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +static inline uint32_t mlx4_transpose(uint32_t val, uint32_t from, uint32_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +enum { + MLX4_MAX_FAMILY_VER = 0 +}; + +enum { + MLX4_MAX_BFS_IN_PAGE = 8, + MLX4_BFS_STRIDE = 512, +}; + enum { MLX4_STAT_RATE_OFFSET = 5 }; @@ -112,14 +198,86 @@ MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 }; +#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl(wr->qp_type.xrc.remote_srqn << 8) + enum { - MLX4_XRC_SRQ_TABLE_BITS = 8, - MLX4_XRC_SRQ_TABLE_SIZE = 1 << MLX4_XRC_SRQ_TABLE_BITS, - MLX4_XRC_SRQ_TABLE_MASK = MLX4_XRC_SRQ_TABLE_SIZE - 1 + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 }; enum { - MLX4_XRC_QPN_BIT = (1 << 23) + MLX4_QP_PATTERN = 0x012389AB, + MLX4_CQ_PATTERN = 0x4567CDEF +}; + +enum mlx4_lock_type { + MLX4_SPIN_LOCK = 0, + MLX4_MUTEX = 1, +}; + +enum mlx4_lock_state { + MLX4_USE_LOCK, + MLX4_LOCKED, + MLX4_UNLOCKED +}; + +/* QP DoorBell ringing methods */ +enum mlx4_db_method { + MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB,/* QP has dedicated BF, */ + /* only one thread is using this QP, */ + /* the arch supports WC auto evict and */ + /* prefer_bf flag is set. */ + /* This means that there is no need for */ + /* wc_wmb to flush the WC buffer */ + MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB, /* Same as previous but prefer_bf */ + /* flag is not set */ + MLX4_QP_DB_METHOD_DEDIC_BF, /* QP has dedicated BF */ + MLX4_QP_DB_METHOD_BF, /* QP has BF which may be shared with other QPs */ + MLX4_QP_DB_METHOD_DB /* BF is not valid for this QP, use DoorBell to send the messages */ +}; + +enum mlx4_res_domain_bf_type { + MLX4_RES_DOMAIN_BF_NONE, /* No BF for this resource domain */ + MLX4_RES_DOMAIN_BF_SAFE, /* Use BF when possible */ + MLX4_RES_DOMAIN_BF_UNSAFE, /* Use BF when possible. */ + /* The application is responsible to sync between */ + /* calls to objects using this resource domain. 
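/*
 * Editorial aside, not part of the patch: MLX4_TRANSPOSE() moves a single
 * flag bit from its position in a hardware mask to its position in a verbs
 * mask by multiplying or dividing with the ratio of the two masks; this is
 * what lets get_flags() translate MLX4_CQE_STATUS_* bits into
 * IBV_EXP_CQ_RX_* bits without conditional branches.  Worked examples:
 */
#include <assert.h>

static void transpose_demo(void)
{
	/* move bit 6 (0x40) down to bit 2 (0x04): (0x40 & 0x40) / (0x40/0x04) */
	assert(MLX4_TRANSPOSE(0x40, 0x40, 0x04) == 0x04);
	/* move bit 0 (0x01) up to bit 7 (0x80): (0x01 & 0x01) * (0x80/0x01) */
	assert(MLX4_TRANSPOSE(0x01, 0x01, 0x80) == 0x80);
	/* source bit clear in val -> result is 0 */
	assert(MLX4_TRANSPOSE(0x00, 0x40, 0x04) == 0x00);
}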
*/ + /* This means that there is no need to use the BF */ + /* lock. */ + MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT, /* Use BF when possible. */ + /* Only one thread is using this resource */ + /* and the arch supports WC auto-evict. */ + /* This means that there is no need to use */ + /* wc_wmb function to flush the BF buffer */ + +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); + +enum { + MLX4_XRC_QPN_BIT = (1 << 23) +}; + +enum qp_cap_cache { + /* The flag below includes VXLAN support as well in mlx4 HW*/ + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1 }; enum mlx4_db_type { @@ -128,6 +286,15 @@ MLX4_NUM_DB_TYPE }; +enum mlx4_alloc_type { + MLX4_ALLOC_TYPE_ANON, + MLX4_ALLOC_TYPE_HUGE, + MLX4_ALLOC_TYPE_CONTIG, + MLX4_ALLOC_TYPE_PREFER_HUGE, + MLX4_ALLOC_TYPE_PREFER_CONTIG, + MLX4_ALLOC_TYPE_ALL +}; + enum { MLX4_OPCODE_NOP = 0x00, MLX4_OPCODE_SEND_INVAL = 0x01, @@ -146,6 +313,12 @@ MLX4_OPCODE_LOCAL_INVAL = 0x1b, MLX4_OPCODE_CONFIG_CMD = 0x1f, + MLX4_OPCODE_SEND_ENABLE = 0x17, + MLX4_OPCODE_RECV_ENABLE = 0x16, + MLX4_OPCODE_CQE_WAIT = 0x0f, + MLX4_OPCODE_CALC_SEND = 0x1e, + MLX4_OPCODE_CALC_RDMA_WRITE_IMM = 0x1f, + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, MLX4_RECV_OPCODE_SEND = 0x01, MLX4_RECV_OPCODE_SEND_IMM = 0x02, @@ -155,28 +328,86 @@ MLX4_CQE_OPCODE_RESIZE = 0x16, }; +extern int mlx4_stall_num_loop; +extern int mlx4_trace; +extern int mlx4_single_threaded; +extern int mlx4_use_mutex; + enum { MLX4_MAX_WQE_SIZE = 1008 }; struct mlx4_device { - struct ibv_device ibv_dev; + struct verbs_device verbs_dev; int page_size; - int driver_abi_ver; + + struct { + unsigned id; + unsigned short rev; + } devid; + int driver_abi_ver; }; struct mlx4_db_page; +struct mlx4_lock { + pthread_mutex_t mutex; + pthread_spinlock_t slock; + enum mlx4_lock_state state; + enum mlx4_lock_type type; +}; + +struct mlx4_spinlock { + pthread_spinlock_t lock; + enum mlx4_lock_state state; +}; + +/* struct for BF dedicated for one QP */ +struct mlx4_dedic_bf { + void *address; +}; + +/* struct for the common BF which may be shared by many QPs */ +struct mlx4_cmn_bf { + void *address; + /* + * Protect usage of BF address field including data written to the BF + * and the BF buffer toggling. 
+ */ + struct mlx4_lock lock; +}; + +union mlx4_bf { + struct mlx4_dedic_bf dedic; + struct mlx4_cmn_bf cmn; +}; + +struct mlx4_bfs_data { + struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1]; + struct mlx4_cmn_bf cmn_bf; + uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1]; + uint8_t dedic_bf_free; + struct mlx4_spinlock dedic_bf_lock; /* protect dedicated BFs managing */ + /* including dedic_bf_used and */ + /* dedic_bf_free fields */ + void *page; + uint16_t buf_size; + uint8_t num_dedic_bfs; +}; + struct mlx4_context { - struct ibv_context ibv_ctx; + union { + struct ibv_context ibv_ctx; + }; + struct mlx4_spinlock send_db_lock; /* protects send_db_list and send_db_num_uars */ + struct list_head send_db_list; + unsigned int send_db_num_uars; void *uar; - pthread_spinlock_t uar_lock; - - void *bf_page; - int bf_buf_size; - int bf_offset; - pthread_spinlock_t bf_lock; + struct mlx4_spinlock uar_lock; + struct mlx4_bfs_data bfs; + int bf_regs_per_page; + int max_ctx_res_domain; struct { struct mlx4_qp **table; @@ -189,24 +420,39 @@ int max_qp_wr; int max_sge; int max_cqe; - int cqe_size; - + uint64_t exp_device_cap_flags; struct { - struct mlx4_srq **table; - int refcnt; - } xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE]; - pthread_mutex_t xrc_srq_table_mutex; - int num_xrc_srqs; - int xrc_srq_table_shift; - int xrc_srq_table_mask; + int offset; + int mult; + int shift; + uint64_t mask; + } core_clk; + void *hca_core_clock; + + struct mlx4_xsrq_table xsrq_table; struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; pthread_mutex_t db_list_mutex; + int cqe_size; + int prefer_bf; + struct mlx4_spinlock hugetlb_lock; + struct list_head hugetlb_list; + int stall_enable; + pthread_mutex_t task_mutex; + struct { + uint8_t valid; + uint8_t link_layer; + enum ibv_port_cap_flags caps; + } port_query_cache[MLX4_PORTS_NUM]; + pthread_mutex_t env_mtx; + int env_initialized; }; struct mlx4_buf { void *buf; + void *hmem; size_t length; + int base; }; struct mlx4_pd { @@ -214,23 +460,40 @@ uint32_t pdn; }; +enum mlx4_cq_model_flags { + /* + * When set the CQ API must be thread safe. + * When reset application is taking care + * to sync between CQ API calls. 
+ */ + MLX4_CQ_MODEL_FLAG_THREAD_SAFE = 1 << 0, +}; + struct mlx4_cq { - struct ibv_cq ibv_cq; + struct ibv_cq ibv_cq __MLX4_ALGN_DATA__; + uint32_t pattern; struct mlx4_buf buf; struct mlx4_buf resize_buf; - pthread_spinlock_t lock; + struct mlx4_lock lock; uint32_t cqn; uint32_t cons_index; + uint32_t wait_index; + uint32_t wait_count; uint32_t *set_ci_db; uint32_t *arm_db; int arm_sn; - int cqe_size; + int stall_next_poll; + int stall_enable; + int cqe_size; + int creation_flags; + struct mlx4_qp *last_qp; + uint32_t model_flags; /* use mlx4_cq_model_flags */ }; struct mlx4_srq { - struct ibv_srq ibv_srq; + struct verbs_srq verbs_srq; struct mlx4_buf buf; - pthread_spinlock_t lock; + struct mlx4_spinlock lock; uint64_t *wrid; uint32_t srqn; int max; @@ -240,33 +503,102 @@ int tail; uint32_t *db; uint16_t counter; + uint8_t ext_srq; + struct ibv_srq_legacy *ibv_srq_legacy; }; struct mlx4_wq { uint64_t *wrid; - pthread_spinlock_t lock; + struct mlx4_lock lock; int wqe_cnt; int max_post; + char *buf; unsigned head; unsigned tail; int max_gs; int wqe_shift; - int offset; + + /* SEND/RECV_ENABLE data */ + unsigned head_en_index; + unsigned head_en_count; +}; + +/* enclosing ibv_mr adding some extra managing information */ +struct mlx4_mr { + struct ibv_mr ibv_mr; + struct mlx4_buf buf; + uint64_t allocation_flags; + int shared_mr; +}; + + +struct mlx4_inlr_rbuff { + void *rbuff; + int rlen; +}; + +struct mlx4_inlr_sg_list { + struct mlx4_inlr_rbuff *sg_list; + int list_len; +}; + +struct mlx4_inlr_buff { + struct mlx4_inlr_sg_list *buff; + int len; +}; + +struct mlx4_send_db_data { + union mlx4_bf bf; + uint32_t *db_addr; /* Points to the BF related send DB */ + struct list_head list; +}; + +enum mlx4_qp_model_flags { + /* + * When set the QP API must be thread safe. + * When reset application is taking care + * to sync between QP API calls. 
+ */ + MLX4_QP_MODEL_FLAG_THREAD_SAFE = 1 << 0, }; struct mlx4_qp { - struct ibv_qp ibv_qp; - struct mlx4_buf buf; - int max_inline_data; + struct verbs_qp verbs_qp; + uint32_t pattern; int buf_size; - + uint32_t model_flags; /* use mlx4_qp_model_flags */ + + /* hot post send data */ + struct mlx4_wq sq __MLX4_ALGN_DATA__; + int (*post_send_one)(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe, int *total_size, + int *inl, unsigned int ind); + union mlx4_bf *bf; + uint32_t *sdb; /* send DB */ + struct mlx4_buf buf; + unsigned last_db_head; uint32_t doorbell_qpn; - uint32_t sq_signal_bits; - int sq_spare_wqes; - struct mlx4_wq sq; - + uint32_t create_flags; + uint16_t max_inline_data; + uint16_t bf_buf_size; + uint16_t sq_spare_wqes; + uint8_t srcrb_flags_tbl[16]; + uint8_t db_method; + uint8_t qp_type; + /* RAW_PACKET hot data */ + uint8_t link_layer; + /* EXT_MASKED_ATOMIC hot data */ + uint8_t is_masked_atomic; + + /* post receive hot data */ + struct mlx4_wq rq __MLX4_ALGN_DATA__; uint32_t *db; - struct mlx4_wq rq; + uint32_t max_inlr_sg; + int32_t cached_rx_csum_flags; + int32_t transposed_rx_csum_flags; + struct mlx4_inlr_buff inlr_buff; + uint8_t qp_cap_cache; }; struct mlx4_av { @@ -280,7 +612,6 @@ uint8_t hop_limit; uint32_t sl_tclass_flowlabel; uint8_t dgid[16]; - uint8_t mac[8]; }; struct mlx4_ah { @@ -288,18 +619,20 @@ struct mlx4_av av; uint16_t vlan; uint8_t mac[6]; - uint8_t tagged; }; -struct mlx4_xrc_domain { - struct ibv_xrc_domain ibv_xrcd; - uint32_t xrcdn; +struct mlx4_res_domain { + struct ibv_exp_res_domain ibv_res_domain; + struct ibv_exp_res_domain_init_attr attr; + enum mlx4_res_domain_bf_type type; + struct mlx4_send_db_data *send_db; }; static inline unsigned long align(unsigned long val, unsigned long align) { return (val + align - 1) & ~(align - 1); } +int align_queue_size(int req); #define to_mxxx(xxx, type) \ ((struct mlx4_##type *) \ @@ -307,7 +640,10 @@ static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) { - return to_mxxx(dev, device); + /* ibv_device is first field of verbs_device + * see try_driver in libibverbs. 
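/*
 * Editorial aside, not part of the patch: to_mdev() and the other to_m*()
 * helpers rely on first-member embedding - a pointer to the first member of
 * a struct has the same address as the struct itself, so container_of() can
 * walk back out to the wrapper.  A stand-alone sketch with hypothetical
 * types (base/middle/outer stand in for ibv_device/verbs_device/mlx4_device):
 */
#include <assert.h>
#include <stddef.h>

struct base   { int id; };
struct middle { struct base base; int extra; };	/* base is the first member */
struct outer  { struct middle mid; int more; };

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void embed_demo(void)
{
	struct outer o;
	struct base *b = &o.mid.base;	/* what a generic API hands back */

	/* same address because 'base' sits at offset 0 of 'middle' ... */
	assert((void *)b == (void *)&o.mid);
	/* ... so container_of() over the 'middle' member recovers the outer */
	assert(demo_container_of(b, struct outer, mid) == &o);
}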
+ */ + return container_of(ibdev, struct mlx4_device, verbs_dev); } static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) @@ -327,32 +663,53 @@ static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) { - return to_mxxx(srq, srq); + return container_of(container_of(ibsrq, struct verbs_srq, srq), + struct mlx4_srq, verbs_srq); } static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) { - return to_mxxx(qp, qp); + return container_of(container_of(ibqp, struct verbs_qp, qp), + struct mlx4_qp, verbs_qp); } +static inline struct mlx4_mr *to_mmr(struct ibv_mr *ibmr) +{ + return to_mxxx(mr, mr); +} static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) { return to_mxxx(ah, ah); } -#ifdef HAVE_IBV_XRC_OPS -static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd) +static inline struct mlx4_res_domain *to_mres_domain(struct ibv_exp_res_domain *ibres_domain) { - return to_mxxx(xrcd, xrc_domain); + return to_mxxx(res_domain, res_domain); } -#endif +int update_port_data(struct ibv_qp *qp, uint8_t port_num); int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); void mlx4_free_buf(struct mlx4_buf *buf); +int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size); +int mlx4_alloc_buf_contig(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size, const char *component, void *req_addr); +int mlx4_alloc_prefered_buf(struct mlx4_context *mctx, + struct mlx4_buf *buf, + size_t size, int page_size, + enum mlx4_alloc_type alloc_type, + const char *component); +void mlx4_get_alloc_type(struct ibv_context *context, const char *component, + enum mlx4_alloc_type *alloc_type, + enum mlx4_alloc_type default_alloc_type); +void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf); +int mlx4_use_huge(struct ibv_context *context, const char *key); uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); +int __mlx4_query_device(uint64_t raw_fw_ver, + struct ibv_device_attr *attr); int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr); int mlx4_query_port(struct ibv_context *context, uint8_t port, @@ -360,19 +717,42 @@ struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); int mlx4_free_pd(struct ibv_pd *pd); +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr); +int mlx4_close_xrcd(struct ibv_xrcd *xrcd); struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, - size_t length, enum ibv_access_flags access); + size_t length, int access); +struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in); +int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr); +void mlx4_update_post_send_one(struct mlx4_qp *qp); +struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); +struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); + +struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in); int mlx4_dereg_mr(struct ibv_mr *mr); +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); +int mlx4_dealloc_mw(struct ibv_mw *mw); +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); +int 
mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind); + struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); -int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, +int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent, int entry_size); int mlx4_resize_cq(struct ibv_cq *cq, int cqe); int mlx4_destroy_cq(struct ibv_cq *cq); -int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_poll_ibv_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size) __MLX4_ALGN_FUNC__; int mlx4_arm_cq(struct ibv_cq *cq, int solicited); void mlx4_cq_event(struct ibv_cq *cq); void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); @@ -382,76 +762,207 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr); +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, - enum ibv_srq_attr_mask mask); + int mask); int mlx4_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_destroy_xrc_srq(struct ibv_srq *srq); int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, struct mlx4_srq *srq); +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); -struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn); -int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn, - struct mlx4_srq *srq); -void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn); struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr); +int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, int attr_mask); +int mlx4_post_task(struct ibv_context *context, + struct ibv_exp_task *task_list, + struct ibv_exp_task **bad_task); +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask, + int attr_mask, struct ibv_qp_init_attr *init_attr); int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask); + int attr_mask); +int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t attr_mask); int mlx4_destroy_qp(struct ibv_qp *qp); +void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n); void mlx4_init_qp_indices(struct mlx4_qp *qp); void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); + struct ibv_send_wr **bad_wr) __MLX4_ALGN_FUNC__; int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, - struct ibv_recv_wr 
**bad_wr); + struct ibv_recv_wr **bad_wr) __MLX4_ALGN_FUNC__; void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); int num_inline_segs(int data, enum ibv_qp_type type); -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, struct mlx4_qp *qp); +void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp); void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type); struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd, + struct ibv_ah_attr *attr, + uint8_t link_layer); struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd, + struct ibv_exp_ah_attr *attr_ex); int mlx4_destroy_ah(struct ibv_ah *ah); int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, struct mlx4_ah *ah); void mlx4_free_av(struct mlx4_ah *ah); -#ifdef HAVE_IBV_XRC_OPS -struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *attr); -struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context, - int fd, int oflag); - -int mlx4_close_xrc_domain(struct ibv_xrc_domain *d); -int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_qp_num); -int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask); -int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr); -int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); -int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); -#endif +struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr); +int mlx4_query_values(struct ibv_context *context, int q_values, + struct ibv_exp_values *values); +void *mlx4_get_legacy_xrc(struct ibv_srq *srq); +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq); +void read_init_vars(struct mlx4_context *ctx); + +static inline enum mlx4_lock_type mlx4_get_locktype(void) +{ + if (!mlx4_use_mutex) + return MLX4_SPIN_LOCK; + + return MLX4_MUTEX; +} + +static inline int mlx4_spin_lock(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_lock(&lock->lock); + + if (unlikely(lock->state == MLX4_LOCKED)) { + fprintf(stderr, "*** ERROR: multithreading violation ***\n" + "You are running a multithreaded application but\n" + "you set MLX4_SINGLE_THREADED=1. 
Please unset it.\n"); + abort(); + } else { + lock->state = MLX4_LOCKED; + wmb(); + } + + return 0; +} + +static inline int mlx4_spin_unlock(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_unlock(&lock->lock); + + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_lock(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_lock(&lock->slock); + + return pthread_mutex_lock(&lock->mutex); + } + + if (unlikely(lock->state == MLX4_LOCKED)) { + fprintf(stderr, "*** ERROR: multithreading violation ***\n" + "You are running a multithreaded application but\n" + "you set MLX4_SINGLE_THREADED=1. Please unset it.\n"); + abort(); + } else { + lock->state = MLX4_LOCKED; + /* Make new state visable to other threads. */ + wmb(); + } + + return 0; +} + +static inline int mlx4_unlock(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_unlock(&lock->slock); + + return pthread_mutex_unlock(&lock->mutex); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} +static inline int mlx4_spinlock_init(struct mlx4_spinlock *lock, int use_spinlock) +{ + if (use_spinlock) { + lock->state = MLX4_USE_LOCK; + return pthread_spin_init(&lock->lock, PTHREAD_PROCESS_PRIVATE); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_spinlock_destroy(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_destroy(&lock->lock); + + return 0; +} + +static inline int mlx4_lock_init(struct mlx4_lock *lock, + int use_lock, + enum mlx4_lock_type lock_type) +{ + if (use_lock) { + lock->type = lock_type; + lock->state = MLX4_USE_LOCK; + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_init(&lock->slock, + PTHREAD_PROCESS_PRIVATE); + + return pthread_mutex_init(&lock->mutex, + PTHREAD_PROCESS_PRIVATE); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_lock_destroy(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_destroy(&lock->slock); + + return pthread_mutex_destroy(&lock->mutex); + } + + return 0; +} + +static inline void mlx4_update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); +} #endif /* MLX4_H */ Index: contrib/ofed/libmlx4/src/mlx4.c =================================================================== --- contrib/ofed/libmlx4/src/mlx4.c +++ contrib/ofed/libmlx4/src/mlx4.c @@ -41,18 +41,27 @@ #include #include #include - +#include +#include +#include #ifndef HAVE_IBV_REGISTER_DRIVER #include #endif +#include #include "mlx4.h" #include "mlx4-abi.h" +#include "mlx4_exp.h" + #ifndef PCI_VENDOR_ID_MELLANOX #define PCI_VENDOR_ID_MELLANOX 0x15b3 #endif +int mlx4_trace = 0; +int mlx4_single_threaded = 0; +int mlx4_use_mutex = 0; + #define HCA(v, d) \ { .vendor = PCI_VENDOR_ID_##v, \ .device = d } @@ -66,47 +75,30 @@ HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ - HCA(MELLANOX, 0x6368), /* MT25448 [ConnectX EN 10GigE, PCIe 2.0 2.5GT/s] */ - HCA(MELLANOX, 0x6750), /* MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */ - HCA(MELLANOX, 0x6372), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 2.0 2.5GT/s] */ - HCA(MELLANOX, 0x675a), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe Gen2 5GT/s] */ - HCA(MELLANOX, 0x6764), /* MT26468 
[ConnectX EN 10GigE, PCIe 2.0 5GT/s] */ - HCA(MELLANOX, 0x6746), /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ - HCA(MELLANOX, 0x676e), /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */ - HCA(MELLANOX, 0x6778), /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ - HCA(MELLANOX, 0x1000), - HCA(MELLANOX, 0x1001), - HCA(MELLANOX, 0x1002), - HCA(MELLANOX, 0x1003), - HCA(MELLANOX, 0x1004), - HCA(MELLANOX, 0x1005), - HCA(MELLANOX, 0x1006), - HCA(MELLANOX, 0x1007), - HCA(MELLANOX, 0x1008), - HCA(MELLANOX, 0x1009), - HCA(MELLANOX, 0x100a), - HCA(MELLANOX, 0x100b), - HCA(MELLANOX, 0x100c), - HCA(MELLANOX, 0x100d), - HCA(MELLANOX, 0x100e), - HCA(MELLANOX, 0x100f), + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ }; -#ifdef HAVE_IBV_MORE_OPS -static struct ibv_more_ops mlx4_more_ops = { -#ifdef HAVE_IBV_XRC_OPS - .create_xrc_srq = mlx4_create_xrc_srq, - .open_xrc_domain = mlx4_open_xrc_domain, - .close_xrc_domain = mlx4_close_xrc_domain, - .create_xrc_rcv_qp = mlx4_create_xrc_rcv_qp, - .modify_xrc_rcv_qp = mlx4_modify_xrc_rcv_qp, - .query_xrc_rcv_qp = mlx4_query_xrc_rcv_qp, - .reg_xrc_rcv_qp = mlx4_reg_xrc_rcv_qp, - .unreg_xrc_rcv_qp = mlx4_unreg_xrc_rcv_qp, -#endif -}; -#endif - static struct ibv_context_ops mlx4_ctx_ops = { .query_device = mlx4_query_device, .query_port = mlx4_query_port, @@ -114,8 +106,11 @@ .dealloc_pd = mlx4_free_pd, .reg_mr = mlx4_reg_mr, .dereg_mr = mlx4_dereg_mr, + .alloc_mw = mlx4_alloc_mw, + .dealloc_mw = mlx4_dealloc_mw, + .bind_mw = mlx4_bind_mw, .create_cq = mlx4_create_cq, - .poll_cq = mlx4_poll_cq, + .poll_cq = mlx4_poll_ibv_cq, .req_notify_cq = mlx4_arm_cq, .cq_event = mlx4_cq_event, .resize_cq = mlx4_resize_cq, @@ -137,150 +132,592 @@ .detach_mcast = ibv_cmd_detach_mcast }; -static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_fd) +static int read_number_from_line(const char *line, int *value) { - struct mlx4_context *context; - struct ibv_get_context cmd; - struct mlx4_alloc_ucontext_resp resp; - struct mlx4_alloc_ucontext_resp_v3 resp_v3; - int i; - struct ibv_device_attr dev_attrs; - unsigned int bf_reg_size; + const char *ptr; - context = calloc(1, sizeof *context); - if (!context) - return NULL; + ptr = strchr(line, ':'); + if (!ptr) + return 1; + + ++ptr; + + *value = atoi(ptr); + return 0; +} + +static int mlx4_is_sandy_bridge(int 
*num_cores) +{ + char line[128]; + FILE *fd; + int rc = 0; + int cur_cpu_family = -1; + int cur_cpu_model = -1; + + fd = fopen("/proc/cpuinfo", "r"); + if (!fd) + return 0; + + *num_cores = 0; + + while (fgets(line, 128, fd)) { + int value; + + /* if this is information on new processor */ + if (!strncmp(line, "processor", 9)) { + ++*num_cores; + + cur_cpu_family = -1; + cur_cpu_model = -1; + } else if (!strncmp(line, "cpu family", 10)) { + if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value))) + cur_cpu_family = value; + } else if (!strncmp(line, "model", 5)) { + if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value))) + cur_cpu_model = value; + } + + /* if this is a Sandy Bridge CPU */ + if ((cur_cpu_family == 6) && + (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D)) + rc = 1; + } + + fclose(fd); + return rc; +} + +static void mlx4_check_numa_enabled(struct ibv_context *context) +{ + char fname[MAXPATHLEN]; + char buf[128]; + FILE *fp; + int numa_enabled; + char env[VERBS_MAX_ENV_VAL]; + + snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/numa_node", + ibv_get_device_name(context->device)); + + fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, PFX "Warning: can not check if NUMA is enabled " + "on node: failed to open %s\n", fname); + return; + } + + if (!fgets(buf, sizeof(buf), fp)) { + fprintf(stderr, PFX "Warning: can not check if NUMA is enabled " + "on node: failed to read numa node value\n"); + goto out; + } + + numa_enabled = (strtol(buf, 0, 10) >= 0); + if (numa_enabled) + printf(PFX "Device NUMA node detection is supported\n"); + else if (ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env, sizeof(env))) + printf(PFX "Warning: Device NUMA node detection is not supported. " + "Please consider setting the environment variable " + "'MLX4_LOCAL_CPUS' or enable ACPI SLIT\n"); +out: + fclose(fp); +} + +static void dump_cpu_set(cpuset_t *cpu_set) +{ + int i; + int first_cpu = -1; + int last_cpu = -1; + int n = 0; + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpu_set)) { + if (first_cpu < 0) + first_cpu = i; + if (i == CPU_SETSIZE - 1) + last_cpu = i; + } else if (first_cpu >= 0) + last_cpu = i - 1; + + if (last_cpu >= 0) { + if (first_cpu != last_cpu) + printf("%s%d-%d", n ? "," : "", first_cpu, + last_cpu); + else + printf("%s%d", n ? "," : "", last_cpu); + + first_cpu = -1; + last_cpu = -1; + ++n; + } + } +} + +/* +man cpuset + + This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words + are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between + words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits + within a word are also in big-endian order. + + The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on + the size of the bitmask. + + Examples of the Mask Format: + + 00000001 # just bit 0 set + 40000000,00000000,00000000 # just bit 94 set + 000000ff,00000000 # bits 32-39 set + 00000000,000E3862 # 1,5,6,11-13,17-19 set + + A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as: + + 00000001,00000001,00010117 + + The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for + bit 4, and the "7" is for bits 2, 1, and 0. 
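/*
 * Editorial aside, not part of the patch: a minimal stand-alone parser for
 * the mask format quoted above (comma-separated 32-bit hex words, most
 * significant word first), walking the string from the last word backwards
 * the same way mlx4_local_cpu_set() below does.  parse_cpu_mask() is a
 * hypothetical helper, limited to 64 CPUs (two words) for brevity:
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static uint64_t parse_cpu_mask(const char *mask)
{
	char buf[128];
	const char *tok;
	char *p;
	uint64_t out = 0;
	int word = 0;		/* 0 = least significant (rightmost) word */

	strncpy(buf, mask, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (;;) {
		p = strrchr(buf, ',');
		tok = p ? p + 1 : buf;

		if (word < 2)	/* a uint64_t holds two 32-bit words */
			out |= (uint64_t)strtoul(tok, NULL, 16) << (32 * word);
		++word;

		if (!p)
			break;
		*p = '\0';	/* drop the word just consumed */
	}

	return out;
}

static void mask_demo(void)
{
	assert(parse_cpu_mask("00000001") == 0x1ULL);			/* bit 0      */
	assert(parse_cpu_mask("000000ff,00000000") == 0xff00000000ULL);	/* bits 32-39 */
}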
+*/ +static void mlx4_local_cpu_set(struct ibv_context *context, cpuset_t *cpu_set) +{ + char *p, buf[1024]; + char env_value[VERBS_MAX_ENV_VAL]; + uint32_t word; + int i, k; + + if (mlx4_trace) + mlx4_check_numa_enabled(context); + + if (!ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env_value, sizeof(env_value))) { + strncpy(buf, env_value, sizeof(buf)); + if (mlx4_trace) + printf(PFX "Local CPUs flags were override by %s\n", buf); + } else { + char fname[MAXPATHLEN]; + FILE *fp; + + snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus", + ibv_get_device_name(context->device)); + + fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname); + return; + } + if (!fgets(buf, sizeof(buf), fp)) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to read cpu mask\n"); + fclose(fp); + return; + } + fclose(fp); + } - context->ibv_ctx.cmd_fd = cmd_fd; + p = strrchr(buf, ','); + if (!p) + p = buf; - if (to_mdev(ibdev)->driver_abi_ver > 3) { - if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp)) - goto err_free; + i = 0; + do { + if (*p == ',') { + *p = 0; + p ++; + } + + word = strtoul(p, 0, 16); + + for (k = 0; word; ++k, word >>= 1) + if (word & 1) + CPU_SET(k+i, cpu_set); + + if (p == buf) + break; + + p = strrchr(buf, ','); + if (!p) + p = buf; + + i += 32; + } while (i < CPU_SETSIZE); +} + +static int mlx4_enable_sandy_bridge_fix(struct ibv_context *context) +{ + cpuset_t my_cpus, dev_local_cpus, result_set; + int stall_enable; + int ret; + int num_cores; + + if (!mlx4_is_sandy_bridge(&num_cores)) + return 0; + + /* by default disable stall on sandy bridge arch */ + stall_enable = 0; + + /* + * check if app is bound to cpu set that is inside + * of device local cpu set. Disable stalling if true + */ + + /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */ + CPU_ZERO(&my_cpus); + CPU_ZERO(&dev_local_cpus); + CPU_ZERO(&result_set); + ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, + sizeof(my_cpus), &my_cpus); + if (ret == -1) { + if (errno == EINVAL) + fprintf(stderr, PFX "Warning: my cpu set is too small\n"); + else + fprintf(stderr, PFX "Warning: failed to get my cpu set\n"); + goto out; + } + + if (mlx4_trace) { + printf(PFX "Running on cpus: "); + dump_cpu_set(&my_cpus); + printf("\n"); + } + + /* get device local cpu set */ + mlx4_local_cpu_set(context, &dev_local_cpus); + + /* make sure result_set is not init to all 0 */ + CPU_SET(0, &result_set); + /* Set stall_enable if my cpu set and dev cpu set are disjoint sets */ + CPU_AND(&result_set, &my_cpus); + CPU_AND(&result_set, &dev_local_cpus); + stall_enable = CPU_COUNT(&result_set) ? 0 : 1; + + if (mlx4_trace) { + printf(PFX "HCA:%s local cpus: ", ibv_get_device_name(context->device)); + dump_cpu_set(&dev_local_cpus); + printf("\n"); + if (CPU_COUNT(&my_cpus) == num_cores) { + printf(PFX "Warning: CPU affinity wasn't used for this " + "process, if the system has more than one numa node, it might be using a remote one.\n"); + printf(PFX " For achieving better performance, " + "please consider setting the CPU " + "affinity.\n"); + } + } + +out: + if (mlx4_trace) + printf(PFX "Sandy Bridge CPU was detected, cq_stall is %s\n", + stall_enable ? 
"enabled" : "disabled"); + + return stall_enable; +} + +static void mlx4_read_env(struct ibv_device *ibdev, struct mlx4_context *ctx) +{ + char env_value[VERBS_MAX_ENV_VAL]; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_TRACE", env_value, sizeof(env_value)) && + (strcmp(env_value, "0"))) + mlx4_trace = 1; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_CQ_POLL", env_value, sizeof(env_value)) && + !strcmp(env_value, "0")) + /* check if cq stall is overrided by user */ + ctx->stall_enable = 0; + else + /* autodetect if we need to do cq polling */ + ctx->stall_enable = mlx4_enable_sandy_bridge_fix(&ctx->ibv_ctx); + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_NUM_LOOP", env_value, sizeof(env_value))) + mlx4_stall_num_loop = atoi(env_value); + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_SINGLE_THREADED", env_value, sizeof(env_value))) + mlx4_single_threaded = strcmp(env_value, "1") ? 0 : 1; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, + "MLX4_USE_MUTEX", + env_value, + sizeof(env_value))) + mlx4_use_mutex = strcmp(env_value, "1") ? 0 : 1; +} + +void read_init_vars(struct mlx4_context *ctx) +{ + char env_value[VERBS_MAX_ENV_VAL]; + + pthread_mutex_lock(&ctx->env_mtx); + if (!ctx->env_initialized) { + mlx4_read_env(ctx->ibv_ctx.device, ctx); + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_POST_SEND_PREFER_BF", env_value, sizeof(env_value))) { + ctx->prefer_bf = !!strcmp(env_value, "0"); + if (mlx4_trace) + printf(PFX "prefer_bf=%d\n", ctx->prefer_bf); + } else { + ctx->prefer_bf = 1; + } - context->num_qps = resp.qp_tab_size; - context->num_xrc_srqs = resp.qp_tab_size; - bf_reg_size = resp.bf_reg_size; - context->cqe_size = resp.cqe_size; + ctx->env_initialized = 1; + } + pthread_mutex_unlock(&ctx->env_mtx); +} + +static int mlx4_init_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx, int cmd_fd) +{ + struct mlx4_context *context; + struct mlx4_alloc_ucontext_req req; + struct mlx4_alloc_ucontext_resp resp; + struct mlx4_alloc_ucontext_resp_v3 resp_v3; + int i; + struct ibv_exp_device_attr dev_attrs; + struct ibv_device_attr dev_legacy_attrs; + struct mlx4_device *dev = to_mdev(&v_device->device); + unsigned int qp_tab_size; + unsigned int bf_reg_size; + unsigned int cqe_size; + int hca_clock_offset; + void *hca_clock_page = NULL; + + /* verbs_context should be used for new verbs. + * memory footprint of mlx4_context and verbs_context share + * struct ibv_context. 
+ */ + struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); + struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ibv_ctx); + + memset(&req, 0, sizeof(req)); + context = to_mctx(ibv_ctx); + ibv_ctx->cmd_fd = cmd_fd; + ibv_ctx->device = &v_device->device; + + if (pthread_mutex_init(&context->env_mtx, NULL)) + return EIO; + + if (dev->driver_abi_ver > 3) { +#ifdef MLX4_WQE_FORMAT + req.lib_caps = MLX4_USER_DEV_CAP_WQE_FORMAT; +#endif + if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp))) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); + qp_tab_size = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + context->bf_regs_per_page = resp.bf_regs_per_page; + cqe_size = resp.cqe_size; } else { - if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, - &resp_v3.ibv_resp, sizeof resp_v3)) - goto err_free; - - context->num_qps = resp_v3.qp_tab_size; - context->num_xrc_srqs = resp_v3.qp_tab_size; - bf_reg_size = resp_v3.bf_reg_size; - context->cqe_size = 32; + if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req.cmd), + &resp_v3.ibv_resp, sizeof(resp_v3))) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp_v3, sizeof(resp_v3)); + qp_tab_size = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->bf_regs_per_page = resp_v3.bf_regs_per_page; + cqe_size = 32; } + context->num_qps = qp_tab_size; context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; context->qp_table_mask = (1 << context->qp_table_shift) - 1; + context->cqe_size = cqe_size; + for (i = 0; i < MLX4_PORTS_NUM; ++i) + context->port_query_cache[i].valid = 0; pthread_mutex_init(&context->qp_table_mutex, NULL); for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) context->qp_table[i].refcnt = 0; - context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1 - - MLX4_XRC_SRQ_TABLE_BITS; - context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1; - - pthread_mutex_init(&context->xrc_srq_table_mutex, NULL); - for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i) - context->xrc_srq_table[i].refcnt = 0; - for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) context->db_list[i] = NULL; + mlx4_init_xsrq_table(&context->xsrq_table, qp_tab_size); pthread_mutex_init(&context->db_list_mutex, NULL); - context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE, + context->uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, 0); if (context->uar == MAP_FAILED) - goto err_free; + return errno; if (bf_reg_size) { - context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size, - PROT_WRITE, MAP_SHARED, cmd_fd, - to_mdev(ibdev)->page_size); - if (context->bf_page == MAP_FAILED) { + context->bfs.page = mmap(NULL, dev->page_size, + PROT_WRITE, MAP_SHARED, cmd_fd, + dev->page_size); + if (context->bfs.page == MAP_FAILED) { fprintf(stderr, PFX "Warning: BlueFlame available, " "but failed to mmap() BlueFlame page.\n"); - context->bf_page = NULL; - context->bf_buf_size = 0; + context->bfs.page = NULL; + context->bfs.buf_size = 0; + context->bfs.num_dedic_bfs = 0; } else { - context->bf_buf_size = bf_reg_size / 2; - context->bf_offset = 0; - pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + context->bfs.num_dedic_bfs = min(context->bf_regs_per_page - 1, + MLX4_MAX_BFS_IN_PAGE - 1); + context->bfs.buf_size = bf_reg_size / 2; + mlx4_spinlock_init(&context->bfs.dedic_bf_lock, !mlx4_single_threaded); + context->bfs.cmn_bf.address = context->bfs.page; + + mlx4_lock_init(&context->bfs.cmn_bf.lock, + !mlx4_single_threaded, + mlx4_get_locktype()); + + 
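+ /*
+ * Carve the remaining BlueFlame registers on this page into
+ * dedicated buffers handed out to QPs; register 0 (bfs.page)
+ * stays as the common BF protected by cmn_bf.lock above.
+ */
+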
context->bfs.dedic_bf_free = context->bfs.num_dedic_bfs; + for (i = 0; i < context->bfs.num_dedic_bfs; i++) { + context->bfs.dedic_bf[i].address = context->bfs.page + (i + 1) * MLX4_BFS_STRIDE; + context->bfs.dedic_bf_used[i] = 0; + } } } else { - context->bf_page = NULL; - context->bf_buf_size = 0; + context->bfs.page = NULL; + context->bfs.buf_size = 0; + context->bfs.num_dedic_bfs = 0; } - pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + mlx4_spinlock_init(&context->uar_lock, !mlx4_single_threaded); - context->ibv_ctx.ops = mlx4_ctx_ops; -#ifdef HAVE_IBV_XRC_OPS - context->ibv_ctx.more_ops = &mlx4_more_ops; -#endif + mlx4_spinlock_init(&context->send_db_lock, !mlx4_single_threaded); + INIT_LIST_HEAD(&context->send_db_list); + + mlx4_spinlock_init(&context->hugetlb_lock, !mlx4_single_threaded); + INIT_LIST_HEAD(&context->hugetlb_list); - if (mlx4_query_device(&context->ibv_ctx, &dev_attrs)) - goto query_free; + pthread_mutex_init(&context->task_mutex, NULL); + + memset(&dev_attrs, 0, sizeof(dev_attrs)); + dev_attrs.comp_mask = IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK | + IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK | + IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | + IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + + if (mlx4_exp_query_device(ibv_ctx, &dev_attrs)) { + if (mlx4_query_device(ibv_ctx, &dev_legacy_attrs)) + goto query_free; + + memcpy(&dev_attrs, &dev_legacy_attrs, sizeof(dev_legacy_attrs)); + } context->max_qp_wr = dev_attrs.max_qp_wr; context->max_sge = dev_attrs.max_sge; context->max_cqe = dev_attrs.max_cqe; - if (!(dev_attrs.device_cap_flags & IBV_DEVICE_XRC)) { - fprintf(stderr, PFX "There is a mismatch between " - "the kernel and the userspace libraries: " - "Kernel does not support XRC. Exiting.\n"); - goto query_free; + context->exp_device_cap_flags = dev_attrs.exp_device_cap_flags; + if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN) + context->max_ctx_res_domain = dev_attrs.max_ctx_res_domain; + + VALGRIND_MAKE_MEM_DEFINED(&context->hca_core_clock, sizeof(context->hca_core_clock)); + if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { + if (dev_attrs.hca_core_clock) + context->core_clk.mult = ((1ull * 1000) << 29) / + dev_attrs.hca_core_clock; + else + context->core_clk.mult = 0; + + context->core_clk.shift = 29; + context->core_clk.mask = dev_attrs.timestamp_mask; + + if (ioctl(cmd_fd, MLX4_IOCHWCLOCKOFFSET, + &hca_clock_offset) >= 0) { + VALGRIND_MAKE_MEM_DEFINED(&hca_clock_offset, sizeof(hca_clock_offset)); + context->core_clk.offset = hca_clock_offset; + hca_clock_page = mmap(NULL, hca_clock_offset + + sizeof(context->core_clk.mask), + PROT_READ, MAP_SHARED, cmd_fd, + dev->page_size * + (MLX4_IB_MMAP_GET_HW_CLOCK)); + + if (hca_clock_page == MAP_FAILED) { + fprintf(stderr, PFX + "Warning: Timestamp available,\n" + "but failed to mmap() hca core " + "clock page.\n"); + } else { + context->hca_core_clock = hca_clock_page + + context->core_clk.offset; + } + } } - return &context->ibv_ctx; + ibv_ctx->ops = mlx4_ctx_ops; + + verbs_ctx->has_comp_mask |= VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ | + VERBS_CONTEXT_QP; + + verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd); + verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd); + verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex); + verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num); + verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex); + verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp); + verbs_set_ctx_op(verbs_ctx, create_flow, ibv_cmd_create_flow); + 
verbs_set_ctx_op(verbs_ctx, destroy_flow, ibv_cmd_destroy_flow); + + /* + * Set experimental verbs + */ + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_reg_shared_mr, mlx4_reg_shared_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow, ibv_exp_cmd_create_flow); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow, ibv_exp_cmd_destroy_flow); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah, mlx4_exp_create_ah); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device, mlx4_exp_query_device); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp, mlx4_exp_create_qp); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp, mlx4_exp_modify_qp); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port, mlx4_exp_query_port); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx4_modify_cq); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx4_post_task); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc, mlx4_set_legacy_xrc); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc, mlx4_get_legacy_xrc); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq, mlx4_exp_poll_cq); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx4_create_cq_ex); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values, mlx4_query_values); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx4_exp_reg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send, mlx4_exp_post_send); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_bind_mw, mlx4_exp_bind_mw); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_rereg_mr, mlx4_exp_rereg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx4_exp_dereg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain, mlx4_exp_create_res_domain); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain, mlx4_exp_destroy_res_domain); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx4_exp_query_intf); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf, mlx4_exp_release_intf); + + return 0; query_free: - munmap(context->uar, to_mdev(ibdev)->page_size); - if (context->bf_page) - munmap(context->bf_page, to_mdev(ibdev)->page_size); - -err_free: - free(context); - return NULL; + munmap(context->uar, dev->page_size); + if (context->bfs.page) + munmap(context->bfs.page, dev->page_size); + if (hca_clock_page) + munmap(hca_clock_page, hca_clock_offset + + sizeof(context->core_clk.mask)); + + return errno; } -static void mlx4_free_context(struct ibv_context *ibctx) +static void mlx4_uninit_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx) { - struct mlx4_context *context = to_mctx(ibctx); - - munmap(context->uar, to_mdev(ibctx->device)->page_size); - if (context->bf_page) - munmap(context->bf_page, to_mdev(ibctx->device)->page_size); - free(context); + struct mlx4_context *context = to_mctx(ibv_ctx); + + munmap(context->uar, to_mdev(&v_device->device)->page_size); + if (context->bfs.page) + munmap(context->bfs.page, + to_mdev(&v_device->device)->page_size); + if (context->hca_core_clock) + munmap((context->hca_core_clock - context->core_clk.offset), + context->core_clk.offset + sizeof(context->core_clk.mask)); } -static struct ibv_device_ops mlx4_dev_ops = { - .alloc_context = mlx4_alloc_context, - .free_context = mlx4_free_context -}; - -static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path, - int abi_version) +static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, + int abi_version) { char value[8]; - struct mlx4_device 
*dev; + struct mlx4_device *dev; unsigned vendor, device; int i; if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", value, sizeof value) < 0) return NULL; - sscanf(value, "%i", &vendor); + vendor = strtol(value, NULL, 16); if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", value, sizeof value) < 0) return NULL; - sscanf(value, "%i", &device); + device = strtol(value, NULL, 16); for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) if (vendor == hca_table[i].vendor && @@ -300,24 +737,32 @@ return NULL; } - dev = malloc(sizeof *dev); + dev = calloc(1, sizeof(*dev)); if (!dev) { fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", uverbs_sys_path); return NULL; } - dev->ibv_dev.ops = mlx4_dev_ops; dev->page_size = sysconf(_SC_PAGESIZE); + + dev->devid.id = device; dev->driver_abi_ver = abi_version; - return &dev->ibv_dev; + dev->verbs_dev.sz = sizeof(*dev); + dev->verbs_dev.size_of_context = + sizeof(struct mlx4_context) - sizeof(struct ibv_context); + /* mlx4_init_context will initialize provider calls */ + dev->verbs_dev.init_context = mlx4_init_context; + dev->verbs_dev.uninit_context = mlx4_uninit_context; + + return &dev->verbs_dev; } #ifdef HAVE_IBV_REGISTER_DRIVER static __attribute__((constructor)) void mlx4_register_driver(void) { - ibv_register_driver("mlx4", mlx4_driver_init); + verbs_register_driver("mlx4", mlx4_driver_init); } #else /* Index: contrib/ofed/libmlx4/src/mlx4_exp.h =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/mlx4_exp.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_EXP_H +#define MLX4_EXP_H + +#include +#include "mlx4.h" + +/* + * mlx4-abi experimental structs + */ +struct mlx4_exp_create_qp { + struct ibv_exp_create_qp ibv_cmd; + struct mlx4_exp_create_qp_provider exp_cmd; +}; + +struct mlx4_exp_create_cq { + struct ibv_exp_create_cq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +/* + * Experimental functions + */ +struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr); +int mlx4_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *attr); +int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_exp_port_attr *port_attr); +int mlx4_exp_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, + int attr_mask); +int mlx4_exp_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr, struct ibv_exp_rereg_out *out); +int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out); +struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain_init_attr *attr); +int mlx4_exp_destroy_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain *res_dom, + struct ibv_exp_destroy_res_domain_attr *attr); +void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); +int mlx4_exp_release_intf(struct ibv_context *context, void *intf, + struct ibv_exp_release_intf_params *params); + +#endif /* MLX4_EXP_H */ Index: contrib/ofed/libmlx4/src/qp.c =================================================================== --- contrib/ofed/libmlx4/src/qp.c +++ contrib/ofed/libmlx4/src/qp.c @@ -40,11 +40,40 @@ #include #include #include +#include #include "mlx4.h" #include "doorbell.h" #include "wqe.h" +#ifndef htobe64 +#include +# if __BYTE_ORDER == __LITTLE_ENDIAN +# define htobe64(x) __bswap_64 (x) +# else +# define htobe64(x) (x) +# endif +#endif + +#ifdef MLX4_WQE_FORMAT + #define SET_BYTE_COUNT(byte_count) (htonl(byte_count) | owner_bit) + #define WQE_CTRL_OWN (1 << 30) +#else + #define SET_BYTE_COUNT(byte_count) htonl(byte_count) + #define WQE_CTRL_OWN (1 << 31) +#endif +enum { + MLX4_OPCODE_BASIC = 0x00010000, + MLX4_OPCODE_MANAGED = 0x00020000, + + MLX4_OPCODE_WITH_IMM = 0x01000000 +}; + +#define MLX4_IB_OPCODE(op, class, attr) (((class) & 0x00FF0000) | ((attr) & 0xFF000000) | ((op) & 0x0000FFFF)) +#define MLX4_IB_OPCODE_GET_CLASS(opcode) ((opcode) & 0x00FF0000) +#define MLX4_IB_OPCODE_GET_OP(opcode) ((opcode) & 0x0000FFFF) +#define MLX4_IB_OPCODE_GET_ATTR(opcode) ((opcode) & 0xFF000000) + static const uint32_t mlx4_ib_opcode[] = { [IBV_WR_SEND] = MLX4_OPCODE_SEND, [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, @@ -55,14 +84,151 @@ [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, }; -static void *get_recv_wqe(struct mlx4_qp *qp, int n) + +static const uint32_t mlx4_ib_opcode_exp[] = { + [IBV_EXP_WR_SEND] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_RDMA_WRITE] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_RDMA_WRITE_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_RDMA_READ] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_READ, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_ATOMIC_CMP_AND_SWP] = 
MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_CS, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_FA, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_CS, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_FA, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_LOCAL_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_LOCAL_INVAL, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_WITH_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_INVAL, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_BIND_MW] = MLX4_IB_OPCODE(MLX4_OPCODE_BIND_MW, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_ENABLE, MLX4_OPCODE_MANAGED, 0), + [IBV_EXP_WR_RECV_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_RECV_ENABLE, MLX4_OPCODE_MANAGED, 0), + [IBV_EXP_WR_CQE_WAIT] = MLX4_IB_OPCODE(MLX4_OPCODE_CQE_WAIT, MLX4_OPCODE_MANAGED, 0), +}; + +enum { + MLX4_CALC_FLOAT64_ADD = 0x00, + MLX4_CALC_UINT64_ADD = 0x01, + MLX4_CALC_UINT64_MAXLOC = 0x02, + MLX4_CALC_UINT64_AND = 0x03, + MLX4_CALC_UINT64_XOR = 0x04, + MLX4_CALC_UINT64_OR = 0x05 +}; + +enum { + MLX4_WQE_CTRL_CALC_OP = 26 +}; + +static const struct mlx4_calc_op { + int valid; + uint32_t opcode; +} mlx4_calc_ops_table + [IBV_EXP_CALC_DATA_SIZE_NUMBER] + [IBV_EXP_CALC_OP_NUMBER] + [IBV_EXP_CALC_DATA_TYPE_NUMBER] = { + [IBV_EXP_CALC_DATA_SIZE_64_BIT] = { + [IBV_EXP_CALC_OP_ADD] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_FLOAT64_ADD << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BXOR] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BAND] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BOR] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_MAXLOC] = { + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_MAXLOC << MLX4_WQE_CTRL_CALC_OP } + } + } +}; + +static int post_send_other(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_rc_raw_packet(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_ud(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int 
post_send_rc_uc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_xrc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; + +#define MLX4_WAIT_EN_VALID (1<<30) + +static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) __attribute__((always_inline)); +static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) { - return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); + struct mlx4_wqe_wait_en_seg *seg = (struct mlx4_wqe_wait_en_seg *)wqe_seg; + + seg->valid = htonl(MLX4_WAIT_EN_VALID); + seg->pi = htonl(count); + seg->obj_num = htonl(obj_num); + + return; } -static void *get_send_wqe(struct mlx4_qp *qp, int n) +static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) __attribute__((always_inline)); +static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->rq.buf + (n << qp->rq.wqe_shift); +} + +void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return get_recv_wqe(qp, n); +} + +static void *get_send_wqe64(struct mlx4_qp *qp, unsigned int n) +{ + return qp->sq.buf + (n << 6); +} +static void *get_send_wqe(struct mlx4_qp *qp, unsigned int n) { - return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); + return qp->sq.buf + (n << qp->sq.wqe_shift); } /* @@ -70,7 +236,48 @@ * first four bytes of every 64 byte chunk with 0xffffffff, except for * the very first chunk of the WQE. */ -static void stamp_send_wqe(struct mlx4_qp *qp, int n) +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head_en_index = 0; + qp->sq.head_en_count = 0; + qp->rq.head_en_index = 0; + qp->rq.head_en_count = 0; +} + +#ifdef MLX4_WQE_FORMAT +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + __be32 *wqe = get_send_wqe(qp, 0); + int wq_size = (qp->sq.wqe_cnt << qp->sq.wqe_shift); + int i; + + for (i = 0; i < wq_size; i += 64) + wqe[i / 4] = htonl(WQE_CTRL_OWN); +} + +static void set_owner_wqe(struct mlx4_qp *qp, unsigned int idx, int ds, + uint32_t owner_bit) +{ + uint32_t *wqe; + int max_sz = (1 << qp->sq.wqe_shift) / 4; + int cur_sz = ds * 4; + int tail_sz; + int i; + + if (max_sz - cur_sz < 16) + return; + + wqe = get_send_wqe(qp, idx & (qp->sq.wqe_cnt - 1)); + tail_sz = max_sz - cur_sz; + for (i = 0; tail_sz > 16; i += 4, tail_sz -= 16) + wqe[cur_sz + i * 4] = owner_bit; +} +#else +static void stamp_send_wqe(struct mlx4_qp *qp, unsigned int n) { uint32_t *wqe = get_send_wqe(qp, n); int i; @@ -80,14 +287,6 @@ wqe[i] = 0xffffffff; } -void mlx4_init_qp_indices(struct mlx4_qp *qp) -{ - qp->sq.head = 0; - qp->sq.tail = 0; - qp->rq.head = 0; - qp->rq.tail = 0; -} - void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) { struct mlx4_wqe_ctrl_seg *ctrl; @@ -95,29 +294,78 @@ for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); - ctrl->owner_opcode = htonl(1 << 31); + ctrl->owner_opcode = htonl(WQE_CTRL_OWN); ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); stamp_send_wqe(qp, i); } } +#endif -static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((noinline)); +static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) { + struct mlx4_cq *cq = to_mcq(qp->verbs_qp.qp.send_cq); unsigned cur; + mlx4_lock(&cq->lock); cur = wq->head - wq->tail; - 
if (cur + nreq < wq->max_post) - return 0; + mlx4_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((always_inline)); +static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) +{ + unsigned cur; - pthread_spin_lock(&cq->lock); cur = wq->head - wq->tail; - pthread_spin_unlock(&cq->lock); + if (likely(cur + nreq < wq->max_post)) + return 0; - return cur + nreq >= wq->max_post; + return __wq_overflow(wq, nreq, qp); +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_exp_send_wr *wr) +{ + uint64_t acc = wr->bind_mw.bind_info.exp_mw_access_flags; + bseg->flags1 = 0; + if (acc & IBV_EXP_ACCESS_REMOTE_ATOMIC) + bseg->flags1 |= htonl(MLX4_WQE_MW_ATOMIC); + if (acc & IBV_EXP_ACCESS_REMOTE_WRITE) + bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_WRITE); + if (acc & IBV_EXP_ACCESS_REMOTE_READ) + bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_READ); + + bseg->flags2 = 0; + if (((struct verbs_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2) + bseg->flags2 |= htonl(MLX4_WQE_BIND_TYPE_2); + if (acc & IBV_EXP_ACCESS_MW_ZERO_BASED) + bseg->flags2 |= htonl(MLX4_WQE_BIND_ZERO_BASED); + + bseg->new_rkey = htonl(wr->bind_mw.rkey); + bseg->lkey = htonl(wr->bind_mw.bind_info.mr->lkey); + bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr); + bseg->length = htobe64(wr->bind_mw.bind_info.length); +} + +static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, + uint32_t rkey) __attribute__((always_inline)); +static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, + uint32_t rkey) +{ + iseg->mem_key = htonl(rkey); + + iseg->reserved1 = 0; + iseg->reserved2 = 0; + iseg->reserved3[0] = 0; + iseg->reserved3[1] = 0; } static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) __attribute__((always_inline)); +static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, uint64_t remote_addr, uint32_t rkey) { rseg->raddr = htonll(remote_addr); @@ -125,16 +373,33 @@ rseg->reserved = 0; } -static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr) +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, + struct ibv_exp_send_wr *wr) { - if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + struct ibv_exp_fetch_add *fa; + + if (wr->exp_opcode == IBV_EXP_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = htonll(wr->wr.atomic.swap); aseg->compare = htonll(wr->wr.atomic.compare_add); + } else if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) { + fa = &wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add; + aseg->swap_add = htonll(fa->add_val); + aseg->compare = htonll(fa->field_boundary); } else { aseg->swap_add = htonll(wr->wr.atomic.compare_add); aseg->compare = 0; } +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + struct ibv_exp_send_wr *wr) +{ + struct ibv_exp_cmp_swap *cs = &wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap; + aseg->swap_data = htonll(cs->swap_val); + aseg->cmp_data = htonll(cs->compare_val); + aseg->swap_mask = htonll(cs->swap_mask); + aseg->cmp_mask = htonll(cs->compare_mask); } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, @@ -147,14 +412,18 @@ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6); } -static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) 
__attribute__((always_inline)); +static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) { dseg->byte_count = htonl(sg->length); dseg->lkey = htonl(sg->lkey); dseg->addr = htonll(sg->addr); } -static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg, + struct ibv_sge *sg, unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg, + struct ibv_sge *sg, unsigned int owner_bit) { dseg->lkey = htonl(sg->lkey); dseg->addr = htonll(sg->addr); @@ -169,7 +438,10 @@ */ wmb(); - dseg->byte_count = htonl(sg->length); + if (likely(sg->length)) + dseg->byte_count = SET_BYTE_COUNT(sg->length); + else + dseg->byte_count = htonl(0x80000000); } /* @@ -177,84 +449,787 @@ * implementations may use move-string-buffer assembler instructions, * which do not guarantee order of copying. */ -static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +#if defined(__amd64__) +#define COPY_64B_WC(dst, src) \ + __asm__ __volatile__ ( \ + " movdqa (%1),%%xmm0\n" \ + " movdqa 16(%1),%%xmm1\n" \ + " movdqa 32(%1),%%xmm2\n" \ + " movdqa 48(%1),%%xmm3\n" \ + " movntdq %%xmm0, (%0)\n" \ + " movntdq %%xmm1, 16(%0)\n" \ + " movntdq %%xmm2, 32(%0)\n" \ + " movntdq %%xmm3, 48(%0)\n" \ + : : "r" (dst), "r" (src) : "memory"); \ + dst += 8; \ + src += 8 +#else +#define COPY_64B_WC(dst, src) \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++ +#endif + +static void mlx4_bf_copy(uint64_t *dst, uint64_t *src, unsigned bytecnt) { while (bytecnt > 0) { - *dst++ = *src++; - *dst++ = *src++; - bytecnt -= 2 * sizeof (long); + COPY_64B_WC(dst, src); + bytecnt -= 8 * sizeof(uint64_t); + } +} + +/* Convert WQE format to fit BF usage */ +static inline void convert_to_bf_wqe(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const unsigned wqe_idx) __attribute__((always_inline)); +static inline void convert_to_bf_wqe(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const unsigned wqe_idx) +{ + uint32_t *tmp = (uint32_t *)ctrl->reserved; + + ctrl->owner_opcode |= htonl((wqe_idx & 0xffff) << 8); + *tmp |= qp->doorbell_qpn; +} + +static inline void copy_wqe_to_bf(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const int aligned_size, + const unsigned wqe_idx, + const int dedic_bf, + const int one_thread_auto_evict) __attribute__((always_inline)); +static inline void copy_wqe_to_bf(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const int aligned_size, + const unsigned wqe_idx, + const int dedic_bf, + const int one_thread_auto_evict) +{ + convert_to_bf_wqe(qp, ctrl, wqe_idx); + + if (dedic_bf && one_thread_auto_evict) + /* + * In case QP has dedicated BF, only one thread using this QP + * and the CPU arch supports auto eviction of WC buffer we can move + * the wc_wmb before the bf_copy (usually it is located after the bf_copy). + * This provides significant improvement in message rate of small messages. + * This barrier keeps BF toggling order by ensuring that previous BF data + * is written to memory before writing to the next BF buffer. + */ + wc_wmb(); + else + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. 
+ */ + wmb(); + + if (dedic_bf) { + mlx4_bf_copy(qp->bf->dedic.address, (uint64_t *) ctrl, aligned_size); + } else { + mlx4_lock(&qp->bf->cmn.lock); + mlx4_bf_copy(qp->bf->cmn.address, (uint64_t *) ctrl, aligned_size); + } + if (!(dedic_bf && one_thread_auto_evict)) + /* + * This barrier ensures that BF data is written to memory + * before toggling the BF buffer. This is to keep the right + * toggling order and to prevent the case in which next BF data + * will be written before the current BF data. + * In addition this barrier ensures the eviction of the WC buffer. + * See comment above for the conditions in which this barrier may be + * set before the bf_copy. + */ + wc_wmb(); + + if (dedic_bf) { + /* Toggle BF buffer */ + qp->bf->dedic.address = (void *)((uintptr_t)qp->bf->dedic.address ^ qp->bf_buf_size); + } else { + /* Toggle BF buffer */ + qp->bf->cmn.address = (void *)((uintptr_t)qp->bf->cmn.address ^ qp->bf_buf_size); + mlx4_unlock(&qp->bf->cmn.lock); + } +} + +static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl, + const int use_bf, const int dedic_bf, const int one_thread_auto_evict, + const int prefer_bf) __attribute__((always_inline)); +static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl, + const int use_bf, const int dedic_bf, const int one_thread_auto_evict, + const int prefer_bf) +{ + if (use_bf && nreq == 1 && (inl || prefer_bf) && + size > 1 && size <= qp->bf_buf_size / 16) { + copy_wqe_to_bf(qp, ctrl, align(size * 16, 64), + qp->sq.head , dedic_bf, + one_thread_auto_evict); + ++qp->sq.head; + } else if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * ringing non-cached doorbell record. + */ + nc_wmb(); + *qp->sdb = qp->doorbell_qpn; + } +} + +static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) __attribute__((noinline)); +static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) +{ + struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context); + + if (nreq == 1 && (inl || ctx->prefer_bf) && size > 1 && size <= qp->bf_buf_size / 16) { + convert_to_bf_wqe(qp, ctrl, qp->sq.head); + + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. 
+ */ + wmb(); + + ++qp->sq.head; + + wmb(); + + } else if (likely(nreq)) { + qp->sq.head += nreq; + + /* Controlled qp */ + wmb(); + } +} + +static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) __attribute__((always_inline)); +static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) +{ + if (unlikely(qp->create_flags & IBV_EXP_QP_CREATE_MANAGED_SEND)) + return __ring_db_mng(qp, ctrl, nreq, size, inl); + + switch (qp->db_method) { + case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 1); + case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 0); + case MLX4_QP_DB_METHOD_DEDIC_BF: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + case MLX4_QP_DB_METHOD_BF: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + case MLX4_QP_DB_METHOD_DB: + return __ring_db(qp, ctrl, nreq, size, inl, 0, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + } +} + +static void set_ctrl_seg(struct mlx4_wqe_ctrl_seg *ctrl, struct ibv_send_wr *wr, + struct mlx4_qp *qp, uint32_t imm, uint32_t srcrb_flags, + unsigned int owner_bit, int size, uint32_t wr_op) +{ + ctrl->srcrb_flags = srcrb_flags; + ctrl->imm = imm; + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + ctrl->owner_opcode = htonl(wr_op) | owner_bit; +} + +static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline)); +static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) +{ + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + int i; + int inl = 0; + + seg = wqe; + wqe += sizeof(*seg); + off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < num_sge; ++i) { + addr = (void *) (uintptr_t) sg_list[i].addr; + len = sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return ENOMEM; + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len)); + seg_len = 0; + seg = wqe; + wqe += sizeof(*seg); + off = sizeof(*seg); + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (likely(seg_len)) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. 
+ */ + wmb(); + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len)); + } + + *size += (inl + num_seg * sizeof(*seg) + 15) / 16; + + return 0; +} + +static inline void set_data_inl_seg_fast(struct mlx4_qp *qp, + void *addr, int length, + void *wqe, int *size, + unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_data_inl_seg_fast(struct mlx4_qp *qp, + void *addr, int length, + void *wqe, int *size, + unsigned int owner_bit) +{ + struct mlx4_wqe_inline_seg *seg; + static const int first_seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg) - sizeof(struct mlx4_wqe_ctrl_seg); + static const int seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg); + + seg = wqe; + wqe += sizeof(*seg); + + if (length <= first_seg_data_size) { + /* For the first segment there is no need to make sure + * all the data is visible before the byte_count field is set. + * This is because the ctrl segment at the beginning of the + * segment covers HCA prefetcher issue. + */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length)); + + memcpy(wqe, addr, length); + *size += (length + sizeof(*seg) + 15) / 16; + } else { + void *start_wqe = seg; + + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | first_seg_data_size)); + memcpy(wqe, addr, first_seg_data_size); + length -= first_seg_data_size; + addr += first_seg_data_size; + seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg)); + wqe += MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg); + + while (length > seg_data_size) { + memcpy(wqe, addr, seg_data_size); + wmb(); /* see comment below */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_data_size)); + length -= seg_data_size ; + addr += seg_data_size; + seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN); + wqe += MLX4_INLINE_ALIGN; + } + memcpy(wqe, addr, length); + + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. 
+ */ + wmb(); + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length)); + *size += (wqe + length - start_wqe + 15) / 16; + } +} + +static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) +{ + if (likely(num_sge == 1)) { + struct mlx4_wqe_data_seg *seg = wqe; + + set_ptr_data(seg, sg_list, owner_bit); + + *size += (sizeof(*seg) / 16); + } else { + struct mlx4_wqe_data_seg *seg = wqe; + int i; + + for (i = num_sge - 1; i >= 0 ; --i) + set_ptr_data(seg + i, sg_list + i, owner_bit); + + *size += num_sge * (sizeof(*seg) / 16); + } +} + +static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl, + int num_sge, struct ibv_sge *sg_list, int *inl, + unsigned int owner_bit) __attribute__((always_inline)); +static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl, + int num_sge, struct ibv_sge *sg_list, int *inl, + unsigned int owner_bit) +{ + if (is_inl) { + /* inl is set to true if this is an inline data segment and num_sge > 0 */ + *inl = num_sge > 0; + return set_data_inl_seg(qp, num_sge, sg_list, seg, sz, + owner_bit); + } + set_data_non_inl_seg(qp, num_sge, sg_list, seg, sz, owner_bit); + + return 0; +} + +static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp, + uint32_t srcrb_flags, uint32_t imm, + void *wqe, void *ctrl, int size, int *total_size, + int *inl, unsigned int ind) __attribute__((always_inline)); +static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp, + uint32_t srcrb_flags, uint32_t imm, + void *wqe, void *ctrl, int size, int *total_size, + int *inl, unsigned int ind) +{ + int ret; + unsigned int owner_bit = (ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0; + + ret = set_data_seg(qp, wqe, &size, !!(wr->send_flags & IBV_SEND_INLINE), + wr->num_sge, wr->sg_list, inl, owner_bit); + if (unlikely(ret)) + return ret; + + *total_size = size; + set_ctrl_seg(ctrl, wr, qp, imm, srcrb_flags, owner_bit, size, + mlx4_ib_opcode[wr->opcode]); + + return 0; + +} + +static int post_send_other(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? 
wr->imm_data : 0; + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); + +} + +static int post_send_rc_raw_packet(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + uint32_t imm; + int idx; + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + + /* Sanity check - prevent from posting empty SR */ + if (unlikely(!wr->num_sge)) + return EINVAL; + + if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) { + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED; + u.srcrb_flags = htonl((uint32_t)(qp->srcrb_flags_tbl[idx] | MLX4_WQE_CTRL_SOLICIT)); + /* For raw eth, take the dmac from the payload */ + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr; + imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2); + } else { + idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + + imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? wr->imm_data : 0; + } + + return set_common_segments(wr, qp, u.srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static int post_send_ud(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? 
wr->imm_data : 0; + + set_datagram_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_datagram_seg); + size += sizeof(struct mlx4_wqe_datagram_seg) / 16; + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static inline int post_send_connected(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind, int is_xrc) __attribute__((always_inline)); +static inline int post_send_connected(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind, int is_xrc) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + uint32_t srcrb_flags; + uint32_t imm = 0; + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + + if (is_xrc) + srcrb_flags = htonl((wr->qp_type.xrc.remote_srqn << 8) | + (qp->srcrb_flags_tbl[idx])); + else + srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, (struct ibv_exp_send_wr *)wr); + wqe += sizeof(struct mlx4_wqe_atomic_seg); + size += (sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IBV_WR_SEND_WITH_IMM: + imm = wr->imm_data; + break; + + case IBV_WR_RDMA_WRITE_WITH_IMM: + imm = wr->imm_data; + if (!wr->num_sge) + *inl = 1; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + size += sizeof(struct mlx4_wqe_raddr_seg) / 16; + break; + + case IBV_WR_RDMA_READ: + *inl = 1; + /* fall through */ + case IBV_WR_RDMA_WRITE: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + size += sizeof(struct mlx4_wqe_raddr_seg) / 16; + + break; + + case IBV_WR_SEND: + break; + + default: + /* No extra segments required for sends */ + break; + } + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static int post_send_rc_uc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 0); +} + +static int post_send_xrc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 1); +} + +void mlx4_update_post_send_one(struct mlx4_qp *qp) +{ + switch (qp->qp_type) { + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC: + qp->post_send_one = post_send_xrc; + break; + case IBV_QPT_RC: + case IBV_QPT_UC: + qp->post_send_one = post_send_rc_uc; + break; + case IBV_QPT_UD: + qp->post_send_one = post_send_ud; + break; + + case IBV_QPT_RAW_PACKET: + qp->post_send_one = post_send_rc_raw_packet; + break; + + default: + qp->post_send_one = post_send_other; + break; } } int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr) + struct ibv_send_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + void *uninitialized_var(ctrl); + unsigned int ind; + int nreq; + int inl = 0; + int ret = 0; + int size = 0; + + mlx4_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + 
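+ /*
+ * ind is the send-queue producer index; the WQE slot is always
+ * ind & (qp->sq.wqe_cnt - 1), and the ind & qp->sq.wqe_cnt bit
+ * supplies the ownership bit, which toggles on every wrap of
+ * the send queue.
+ */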
+ ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + /* to be considered whether can throw first check, create_qp_exp with post_send */ + if (!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW)) + if (unlikely(wq_overflow(&qp->sq, nreq, qp))) { + ret = ENOMEM; + errno = ret; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + ret = ENOMEM; + errno = ret; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->opcode >= sizeof(mlx4_ib_opcode) / sizeof(mlx4_ib_opcode[0]))) { + ret = EINVAL; + errno = ret; + *bad_wr = wr; + goto out; + } + + ctrl = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ret = qp->post_send_one(wr, qp, ctrl, &size, &inl, ind); + if (unlikely(ret)) { + inl = 0; + errno = ret; + *bad_wr = wr; + goto out; + } + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (likely(wr->next)) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); +#else + /* Make sure all owners bits are set to HW ownership */ + set_owner_wqe(qp, ind, size, + ((ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0)); +#endif + + ++ind; + } + +out: + ring_db(qp, ctrl, nreq, size, inl); + + if (likely(nreq)) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, ind - 1, size, + ((ind - 1) & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0)); +#endif + mlx4_unlock(&qp->sq.lock); + + return ret; +} + +int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr) { - struct mlx4_context *ctx; struct mlx4_qp *qp = to_mqp(ibqp); void *wqe; - struct mlx4_wqe_ctrl_seg *ctrl; - int ind; + void *uninitialized_var(ctrl); + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + uint32_t imm; + int idx; + unsigned int ind; + int uninitialized_var(owner_bit); int nreq; int inl = 0; int ret = 0; - int size; - int i; + int size = 0; + uint32_t mlx4_wr_op; + uint64_t exp_send_flags; - pthread_spin_lock(&qp->sq.lock); + mlx4_lock(&qp->sq.lock); /* XXX check that state is OK to post send */ ind = qp->sq.head; for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { - ret = -1; + exp_send_flags = wr->exp_send_flags; + + if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW) && + wq_overflow(&qp->sq, nreq, qp))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + ret = ENOMEM; *bad_wr = wr; goto out; } - if (wr->num_sge > qp->sq.max_gs) { - ret = -1; + if (unlikely(wr->exp_opcode >= sizeof(mlx4_ib_opcode_exp) / sizeof(mlx4_ib_opcode_exp[0]))) { + ret = EINVAL; *bad_wr = wr; goto out; } - if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) { - ret = -1; + if (((MLX4_IB_OPCODE_GET_CLASS(mlx4_ib_opcode_exp[wr->exp_opcode]) == MLX4_OPCODE_MANAGED) || + (exp_send_flags & IBV_EXP_SEND_WITH_CALC)) && + !(qp->create_flags & IBV_EXP_QP_CREATE_CROSS_CHANNEL)) { + ret = EINVAL; *bad_wr = wr; goto out; } + mlx4_wr_op = MLX4_IB_OPCODE_GET_OP(mlx4_ib_opcode_exp[wr->exp_opcode]); + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + owner_bit = ind & qp->sq.wqe_cnt ? 
htonl(WQE_CTRL_OWN) : 0; - ctrl->xrcrb_flags = - (wr->send_flags & IBV_SEND_SIGNALED ? - htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | - (wr->send_flags & IBV_SEND_SOLICITED ? - htonl(MLX4_WQE_CTRL_SOLICIT) : 0) | - qp->sq_signal_bits; + idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED | + (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) | + (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2); + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); - if (wr->opcode == IBV_WR_SEND_WITH_IMM || - wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) - ctrl->imm = wr->imm_data; - else - ctrl->imm = 0; + imm = (MLX4_IB_OPCODE_GET_ATTR(mlx4_ib_opcode_exp[wr->exp_opcode]) & MLX4_OPCODE_WITH_IMM ? + wr->ex.imm_data : 0); - wqe += sizeof *ctrl; - size = sizeof *ctrl / 16; + wqe += sizeof(struct mlx4_wqe_ctrl_seg); + size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; - switch (ibqp->qp_type) { + switch (qp->qp_type) { + case IBV_QPT_XRC_SEND: case IBV_QPT_XRC: - ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8); - /* fall thru */ + u.srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr); + /* fall through */ case IBV_QPT_RC: case IBV_QPT_UC: - switch (wr->opcode) { - case IBV_WR_ATOMIC_CMP_AND_SWP: - case IBV_WR_ATOMIC_FETCH_AND_ADD: - set_raddr_seg(wqe, wr->wr.atomic.remote_addr, - wr->wr.atomic.rkey); + switch (wr->exp_opcode) { + case IBV_EXP_WR_ATOMIC_CMP_AND_SWP: + case IBV_EXP_WR_ATOMIC_FETCH_AND_ADD: + case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD: + if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) { + if (!qp->is_masked_atomic) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + set_raddr_seg(wqe, + wr->ext_op.masked_atomics.remote_addr, + wr->ext_op.masked_atomics.rkey); + } else { + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + } wqe += sizeof (struct mlx4_wqe_raddr_seg); set_atomic_seg(wqe, wr); @@ -264,184 +1239,259 @@ break; - case IBV_WR_RDMA_READ: + case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP: + if (!qp->is_masked_atomic) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + set_raddr_seg(wqe, + wr->ext_op.masked_atomics.remote_addr, + wr->ext_op.masked_atomics.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_masked_atomic_seg); + size += (sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_masked_atomic_seg)) / 16; + break; + + case IBV_EXP_WR_RDMA_READ: inl = 1; /* fall through */ - case IBV_WR_RDMA_WRITE: - case IBV_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(wqe, wr->wr.rdma.remote_addr, - wr->wr.rdma.rkey); + case IBV_EXP_WR_RDMA_WRITE_WITH_IMM: + if (!wr->num_sge) + inl = 1; + /* fall through */ + case IBV_EXP_WR_RDMA_WRITE: + if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) { + + if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER || + (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER || + (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER || + !mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].valid) { + ret = -1; + *bad_wr = wr; + goto out; + } + + mlx4_wr_op = MLX4_OPCODE_CALC_RDMA_WRITE_IMM | + mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].opcode; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + + } else { + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + } wqe += sizeof (struct mlx4_wqe_raddr_seg); size += sizeof (struct mlx4_wqe_raddr_seg) / 16; break; - 
default: - /* No extra segments required for sends */ + case IBV_EXP_WR_LOCAL_INV: + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof + (struct mlx4_wqe_local_inval_seg); + size += sizeof + (struct mlx4_wqe_local_inval_seg) / 16; break; - } - break; - - case IBV_QPT_UD: - set_datagram_seg(wqe, wr); - wqe += sizeof (struct mlx4_wqe_datagram_seg); - size += sizeof (struct mlx4_wqe_datagram_seg) / 16; - if (to_mah(wr->wr.ud.ah)->tagged) { - ctrl->ins_vlan = 1 << 6; - ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan); - } - break; + case IBV_EXP_WR_BIND_MW: + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof + (struct mlx4_wqe_bind_seg); + size += sizeof + (struct mlx4_wqe_bind_seg) / 16; + break; - default: - break; - } + case IBV_EXP_WR_SEND: + if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) { + + if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER || + (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER || + (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER || + !mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].valid) { + ret = -1; + *bad_wr = wr; + goto out; + } + + mlx4_wr_op = MLX4_OPCODE_CALC_SEND | + mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].opcode; + } - if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { - struct mlx4_wqe_inline_seg *seg; - void *addr; - int len, seg_len; - int num_seg; - int off, to_copy; + break; - inl = 0; + case IBV_EXP_WR_CQE_WAIT: + { + struct mlx4_cq *wait_cq = to_mcq(wr->task.cqe_wait.cq); + uint32_t wait_index = 0; - seg = wqe; - wqe += sizeof *seg; - off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); - num_seg = 0; - seg_len = 0; + wait_index = wait_cq->wait_index + + wr->task.cqe_wait.cq_count; + wait_cq->wait_count = max(wait_cq->wait_count, + wr->task.cqe_wait.cq_count); - for (i = 0; i < wr->num_sge; ++i) { - addr = (void *) (uintptr_t) wr->sg_list[i].addr; - len = wr->sg_list[i].length; - inl += len; + if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) { + wait_cq->wait_index += wait_cq->wait_count; + wait_cq->wait_count = 0; + } - if (inl > qp->max_inline_data) { - inl = 0; - ret = -1; - *bad_wr = wr; - goto out; + set_wait_en_seg(wqe, wait_cq->cqn, wait_index); + wqe += sizeof(struct mlx4_wqe_wait_en_seg); + size += sizeof(struct mlx4_wqe_wait_en_seg) / 16; } + break; - while (len >= MLX4_INLINE_ALIGN - off) { - to_copy = MLX4_INLINE_ALIGN - off; - memcpy(wqe, addr, to_copy); - len -= to_copy; - wqe += to_copy; - addr += to_copy; - seg_len += to_copy; - wmb(); /* see comment below */ - seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); - seg_len = 0; - seg = wqe; - wqe += sizeof *seg; - off = sizeof *seg; - ++num_seg; + case IBV_EXP_WR_SEND_ENABLE: + case IBV_EXP_WR_RECV_ENABLE: + { + unsigned head_en_index; + struct mlx4_wq *wq; + + /* + * Posting work request for QP that does not support + * SEND/RECV ENABLE makes performance worse. + */ + if (((wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) && + !(to_mqp(wr->task.wqe_enable.qp)->create_flags & + IBV_EXP_QP_CREATE_MANAGED_SEND)) || + ((wr->exp_opcode == IBV_EXP_WR_RECV_ENABLE) && + !(to_mqp(wr->task.wqe_enable.qp)->create_flags & + IBV_EXP_QP_CREATE_MANAGED_RECV))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wq = (wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) ? 
+ &to_mqp(wr->task.wqe_enable.qp)->sq : + &to_mqp(wr->task.wqe_enable.qp)->rq; + + /* If wqe_count is 0 release all WRs from queue */ + if (wr->task.wqe_enable.wqe_count) { + head_en_index = wq->head_en_index + + wr->task.wqe_enable.wqe_count; + wq->head_en_count = max(wq->head_en_count, + wr->task.wqe_enable.wqe_count); + + if ((int)(wq->head - head_en_index) < 0) { + ret = -1; + *bad_wr = wr; + goto out; + } + } else { + head_en_index = wq->head; + wq->head_en_count = wq->head - wq->head_en_index; + } + + if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) { + wq->head_en_index += wq->head_en_count; + wq->head_en_count = 0; + } + + set_wait_en_seg(wqe, + wr->task.wqe_enable.qp->qp_num, + head_en_index); + + wqe += sizeof(struct mlx4_wqe_wait_en_seg); + size += sizeof(struct mlx4_wqe_wait_en_seg) / 16; } + break; - memcpy(wqe, addr, len); - wqe += len; - seg_len += len; - off += len; - } + case IBV_EXP_WR_SEND_WITH_INV: + imm = htonl(wr->ex.invalidate_rkey); + break; - if (seg_len) { - ++num_seg; - /* - * Need a barrier here to make sure - * all the data is visible before the - * byte_count field is set. Otherwise - * the HCA prefetcher could grab the - * 64-byte chunk with this inline - * segment and get a valid (!= - * 0xffffffff) byte count but stale - * data, and end up sending the wrong - * data. - */ - wmb(); - seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + default: + /* No extra segments required for sends */ + break; } + break; - size += (inl + num_seg * sizeof * seg + 15) / 16; - } else { - struct mlx4_wqe_data_seg *seg = wqe; + case IBV_QPT_UD: + set_datagram_seg(wqe, (struct ibv_send_wr *)wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; - for (i = wr->num_sge - 1; i >= 0 ; --i) - set_data_seg(seg + i, wr->sg_list + i); + case IBV_QPT_RAW_PACKET: + /* Sanity check - prevent from posting empty SR */ + if (unlikely(!wr->num_sge)) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) { + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT); + /* For raw eth, take the dmac from the payload */ + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr; + imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2); + } + break; - size += wr->num_sge * (sizeof *seg / 16); + default: + break; } - ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? - MLX4_WQE_CTRL_FENCE : 0) | size; - - /* - * Make sure descriptor is fully written before - * setting ownership bit (because HW can start - * executing as soon as we do). - */ - wmb(); - - ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) | - (ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0); + ret = set_data_seg(qp, wqe, &size, !!(exp_send_flags & IBV_EXP_SEND_INLINE), + wr->num_sge, wr->sg_list, &inl, owner_bit); + if (unlikely(ret)) { + inl = 0; + *bad_wr = wr; + goto out; + } + set_ctrl_seg(ctrl, (struct ibv_send_wr *)wr, qp, imm, u.srcrb_flags, owner_bit, size, mlx4_wr_op); /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. 
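The legacy inline-copy code being removed above carries the key ordering rule for inline data: the payload must be globally visible before a non-zero byte_count is, otherwise the HCA prefetcher can see a valid count with stale data. The sketch below expresses that publish-last discipline with portable C11 atomics instead of the driver's wmb(); the struct layout is invented and does not match the real WQE segments, and in the driver the consumer is the HCA rather than another thread.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented stand-in for an inline segment: payload plus a byte count
 * whose non-zero value tells the consumer the payload is valid. */
struct inline_seg {
    char buf[60];
    _Atomic uint32_t byte_count;   /* 0 = not ready */
};

static void publish(struct inline_seg *seg, const void *data, uint32_t len)
{
    memcpy(seg->buf, data, len);
    /* Release store: everything written above is visible before the
     * count, the moral equivalent of wmb(); seg->byte_count = ... */
    atomic_store_explicit(&seg->byte_count, len, memory_order_release);
}

static uint32_t consume(const struct inline_seg *seg, void *out)
{
    /* Acquire load pairs with the release store above. */
    uint32_t len = atomic_load_explicit(&seg->byte_count, memory_order_acquire);

    if (len)
        memcpy(out, seg->buf, len);
    return len;
}

int main(void)
{
    struct inline_seg seg = { .byte_count = 0 };
    char out[60];

    publish(&seg, "hello", 5);
    printf("consumed %u bytes\n", consume(&seg, out));
    return 0;
}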
*/ - if (wr->next) + if (likely(wr->next)) +#ifndef MLX4_WQE_FORMAT stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & (qp->sq.wqe_cnt - 1)); - +#else + set_owner_wqe(qp, ind, size, owner_bit); +#endif ++ind; } out: - ctx = to_mctx(ibqp->context); - - if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) { - ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8); - *(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn; - /* - * Make sure that descriptor is written to memory - * before writing to BlueFlame page. - */ - wmb(); - - ++qp->sq.head; - - pthread_spin_lock(&ctx->bf_lock); - - mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl, - align(size * 16, 64)); - wc_wmb(); - - ctx->bf_offset ^= ctx->bf_buf_size; - - pthread_spin_unlock(&ctx->bf_lock); - } else if (nreq) { - qp->sq.head += nreq; - - /* - * Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn; - } - - if (nreq) + ring_db(qp, ctrl, nreq, size, inl); + if (likely(nreq)) +#ifndef MLX4_WQE_FORMAT stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, ind - 1, size, owner_bit); +#endif - pthread_spin_unlock(&qp->sq.lock); + mlx4_unlock(&qp->sq.lock); return ret; } + + int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { @@ -449,24 +1499,25 @@ struct mlx4_wqe_data_seg *scat; int ret = 0; int nreq; - int ind; + unsigned int ind; int i; + struct mlx4_inlr_rbuff *rbuffs; - pthread_spin_lock(&qp->rq.lock); + mlx4_lock(&qp->rq.lock); /* XXX check that state is OK to post receive */ - ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { - ret = -1; + if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW) && + wq_overflow(&qp->rq, nreq, qp))) { + ret = ENOMEM; *bad_wr = wr; goto out; } - if (wr->num_sge > qp->rq.max_gs) { - ret = -1; + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + ret = EINVAL; *bad_wr = wr; goto out; } @@ -476,11 +1527,20 @@ for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); - if (i < qp->rq.max_gs) { + if (likely(i < qp->rq.max_gs)) { scat[i].byte_count = 0; scat[i].lkey = htonl(MLX4_INVALID_LKEY); scat[i].addr = 0; } + if (qp->max_inlr_sg) { + rbuffs = qp->inlr_buff.buff[ind].sg_list; + qp->inlr_buff.buff[ind].list_len = wr->num_sge; + for (i = 0; i < wr->num_sge; ++i) { + rbuffs->rbuff = (void *)(unsigned long)(wr->sg_list[i].addr); + rbuffs->rlen = wr->sg_list[i].length; + rbuffs++; + } + } qp->rq.wrid[ind] = wr->wr_id; @@ -488,7 +1548,7 @@ } out: - if (nreq) { + if (likely(nreq)) { qp->rq.head += nreq; /* @@ -500,7 +1560,7 @@ *qp->db = htonl(qp->rq.head & 0xffff); } - pthread_spin_unlock(&qp->rq.lock); + mlx4_unlock(&qp->rq.lock); return ret; } @@ -533,6 +1593,7 @@ struct mlx4_qp *qp) { int size; + int atomic_size; int max_sq_sge; max_sq_sge = align(cap->max_inline_data + @@ -553,6 +1614,7 @@ size += sizeof (struct mlx4_wqe_raddr_seg); break; + case IBV_QPT_XRC_SEND: case IBV_QPT_XRC: case IBV_QPT_RC: size += sizeof (struct mlx4_wqe_raddr_seg); @@ -560,12 +1622,14 @@ * An atomic op will require an atomic segment, a * remote address segment and one scatter entry. 
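The removed doorbell tail above shows the two ways the hardware is kicked: a single small, fully inline WQE is copied straight into the BlueFlame (write-combining) buffer, everything else gets a plain doorbell write; the new code moves this choice into ring_db(). The following sketch models only that decision with stub I/O functions and a placeholder buffer size; the real bf_lock, wc_wmb() and offset toggling are not reproduced.

#include <stdint.h>
#include <stdio.h>

/* Placeholder limit; the real value comes from the device context. */
#define BF_BUF_SIZE   256          /* bytes available per BlueFlame buffer */

/* Stubs standing in for the MMIO accessors in the driver. */
static void bf_copy_stub(const void *wqe, unsigned bytes)
{
    printf("BlueFlame: copy %u bytes of the WQE to WC memory\n", bytes);
}

static void doorbell_stub(uint32_t qpn)
{
    printf("Doorbell: write qpn 0x%x to the send doorbell register\n", qpn);
}

/*
 * Decide how to notify the HCA after posting.  A lone, fully inline WQE
 * that fits in the BlueFlame buffer can be pushed directly (saving the
 * doorbell round trip); anything else falls back to the doorbell.
 */
static void kick_hw(int nreq, int inline_only, int wqe_size_16b, uint32_t qpn,
                    const void *wqe)
{
    if (nreq == 1 && inline_only &&
        wqe_size_16b > 1 && wqe_size_16b < BF_BUF_SIZE / 16)
        bf_copy_stub(wqe, (unsigned)wqe_size_16b * 16);
    else if (nreq)
        doorbell_stub(qpn);
}

int main(void)
{
    uint64_t fake_wqe[8] = { 0 };

    kick_hw(1, 1, 4, 0x1234, fake_wqe);   /* small inline WQE -> BlueFlame */
    kick_hw(3, 0, 4, 0x1234, fake_wqe);   /* batch -> doorbell */
    return 0;
}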
*/ - if (size < (sizeof (struct mlx4_wqe_atomic_seg) + - sizeof (struct mlx4_wqe_raddr_seg) + - sizeof (struct mlx4_wqe_data_seg))) - size = (sizeof (struct mlx4_wqe_atomic_seg) + - sizeof (struct mlx4_wqe_raddr_seg) + - sizeof (struct mlx4_wqe_data_seg)); + atomic_size = (qp->is_masked_atomic ? + sizeof(struct mlx4_wqe_masked_atomic_seg) : + sizeof(struct mlx4_wqe_atomic_seg)) + + sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_data_seg); + + if (size < atomic_size) + size = atomic_size; break; default: @@ -583,56 +1647,39 @@ ; /* nothing */ } -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, struct mlx4_qp *qp) +int mlx4_use_huge(struct ibv_context *context, const char *key) { - qp->rq.max_gs = cap->max_recv_sge; + char e[VERBS_MAX_ENV_VAL]; - qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); - if (!qp->sq.wrid) - return -1; + if (!ibv_exp_cmd_getenv(context, key, e, sizeof(e)) && !strcmp(e, "y")) + return 1; + return 0; +} + +void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp) +{ if (qp->rq.wqe_cnt) { - qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); - if (!qp->rq.wrid) { - free(qp->sq.wrid); - return -1; + free(qp->rq.wrid); + if (qp->max_inlr_sg) { + free(qp->inlr_buff.buff[0].sg_list); + free(qp->inlr_buff.buff); } } - - for (qp->rq.wqe_shift = 4; - 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg); - qp->rq.wqe_shift++) - ; /* nothing */ - - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << qp->sq.wqe_shift); - if (qp->rq.wqe_shift > qp->sq.wqe_shift) { - qp->rq.offset = 0; - qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - } else { - qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; - qp->sq.offset = 0; - } - - if (mlx4_alloc_buf(&qp->buf, - align(qp->buf_size, to_mdev(pd->context->device)->page_size), - to_mdev(pd->context->device)->page_size)) { + if (qp->sq.wqe_cnt) free(qp->sq.wrid); - free(qp->rq.wrid); - return -1; - } - memset(qp->buf.buf, 0, qp->buf_size); - - return 0; + if (qp->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(context), &qp->buf); + else + mlx4_free_buf(&qp->buf); } void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type) { int wqe_size; - struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context); + struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context); wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) - sizeof (struct mlx4_wqe_ctrl_seg); @@ -641,9 +1688,10 @@ wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); break; + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC: case IBV_QPT_UC: case IBV_QPT_RC: - case IBV_QPT_XRC: wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); break; @@ -704,3 +1752,812 @@ else ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; } + +int mlx4_post_task(struct ibv_context *context, + struct ibv_exp_task *task_list, + struct ibv_exp_task **bad_task) +{ + int rc = 0; + struct ibv_exp_task *cur_task = NULL; + struct ibv_exp_send_wr *bad_wr; + struct mlx4_context *mlx4_ctx = to_mctx(context); + + if (!task_list) + return rc; + + pthread_mutex_lock(&mlx4_ctx->task_mutex); + + cur_task = task_list; + while (!rc && cur_task) { + + switch (cur_task->task_type) { + case IBV_EXP_TASK_SEND: + rc = ibv_exp_post_send(cur_task->item.qp, + cur_task->item.send_wr, + &bad_wr); + break; + + case IBV_EXP_TASK_RECV: + rc = ibv_post_recv(cur_task->item.qp, + cur_task->item.recv_wr, + NULL); + break; + + default: + rc = -1; + } + + if (rc && bad_task) { + 
*bad_task = cur_task; + break; + } + + cur_task = cur_task->next; + } + + pthread_mutex_unlock(&mlx4_ctx->task_mutex); + + return rc; +} + +/* + * family interfaces functions + */ + +/* + * send_pending - is a general post send function that put one message in + * the send queue. The function is not ringing the QP door-bell. + * + * User may call this function several times to fill send queue with + * several messages, then he can call mlx4_send_flush to ring the QP DB + * + * This function is used to implement the following QP burst family functions: + * - send_pending + * - send_pending_inline + * - send_pending_sg_list + * - send_burst + */ +static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, + const int use_raw_eth, const int use_inl, + const int thread_safe, const int wqe_64, + const int use_sg_list, int num_sge, + struct ibv_sge *sg_list, + const int lb) __attribute__((always_inline)); +static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, + const int use_raw_eth, const int use_inl, + const int thread_safe, const int wqe_64, + const int use_sg_list, int num_sge, + struct ibv_sge *sg_list, + const int lb) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + uint32_t tunnel_offload = 0; + unsigned int owner_bit = qp->sq.head & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0; + int size; + int idx; + int i; + + if (thread_safe) + mlx4_lock(&qp->sq.lock); + + if (wqe_64) + ctrl = get_send_wqe64(qp, qp->sq.head & (qp->sq.wqe_cnt - 1)); + else + ctrl = get_send_wqe(qp, qp->sq.head & (qp->sq.wqe_cnt - 1)); + + dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) + sizeof(struct mlx4_wqe_ctrl_seg)); + + if (use_sg_list) { + for (i = num_sge - 1; i >= 0 ; --i) + set_ptr_data(dseg + i, sg_list + i, owner_bit); + + size = (sizeof(struct mlx4_wqe_ctrl_seg) + (num_sge * sizeof(struct mlx4_wqe_data_seg)))/ 16; + } else { + if (use_inl) { + size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + set_data_inl_seg_fast(qp, (void *)(uintptr_t)addr, length, dseg, &size, owner_bit); + } else { + size = (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))/ 16; + dseg->byte_count = SET_BYTE_COUNT(length); + dseg->lkey = htonl(lkey); + dseg->addr = htonll(addr); + } + } + + if (use_raw_eth) { + /* For raw eth, the SOLICIT flag is used + * to indicate that no icrc should be calculated */ + idx = IBV_EXP_QP_BURST_SOLICITED | + (flags & (IBV_EXP_QP_BURST_SIGNALED | + IBV_EXP_QP_BURST_IP_CSUM | + IBV_EXP_QP_BURST_TUNNEL)); + tunnel_offload = flags & IBV_EXP_QP_BURST_TUNNEL ? MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_IL4 : 0; + } else { + idx = (flags & (IBV_EXP_QP_BURST_SIGNALED | + IBV_EXP_QP_BURST_SOLICITED | + IBV_EXP_QP_BURST_IP_CSUM)); + } + + if (use_raw_eth && lb) { + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + /* For raw eth, take the dmac from the payload */ + if (use_sg_list) + addr = sg_list[0].addr; + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)addr; + ctrl->srcrb_flags = u.srcrb_flags; + ctrl->imm = *(uint32_t *)((uintptr_t)(addr)+2); + } else { + ctrl->srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + ctrl->imm = 0; + } + ctrl->fence_size = (flags & IBV_EXP_QP_BURST_FENCE ? 
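The comment block above spells out the burst-family calling pattern: queue several messages with send_pending()/send_pending_inline(), then ring the doorbell once with send_flush(). A hedged usage sketch follows; the verbs_exp.h header name is an assumption, the family pointer is assumed to have been obtained from the experimental query-interface call (not shown), and the signalling policy is just an example.

#include <stdint.h>
#include <infiniband/verbs_exp.h>   /* assumed header for the experimental verbs API */

/*
 * Queue a batch of pre-registered buffers and ring the doorbell once.
 * 'fam' is assumed to have been returned for this QP by the experimental
 * query-interface call; error handling is minimal.
 */
static int send_batch(struct ibv_exp_qp_burst_family *fam, struct ibv_qp *qp,
                      const struct ibv_sge *bufs, int n)
{
    int i, ret;

    for (i = 0; i < n; i++) {
        /* Only the last message asks for a completion. */
        uint32_t flags = (i == n - 1) ? IBV_EXP_QP_BURST_SIGNALED : 0;

        ret = fam->send_pending(qp, bufs[i].addr, bufs[i].length,
                                bufs[i].lkey, flags);
        if (ret)
            return ret;
    }

    /* One doorbell (or BlueFlame write) for the whole batch. */
    return fam->send_flush(qp);
}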
MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = htonl(MLX4_OPCODE_SEND | tunnel_offload) | owner_bit; + qp->sq.head++; + + if (!wqe_64) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (qp->sq.head + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, qp->sq.head, size, owner_bit); +#endif + if (thread_safe) + mlx4_unlock(&qp->sq.lock); + else + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + return 0; +} + +/* burst family - send_pending */ +static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(qp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && + mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, inl, safe, */ + return send_pending(qp, addr, length, lkey, flags, raw_eth, 0, 1, + /* wqe_64, use_sg, num_sge, sg_list, lb */ + wqe_64, 0, 0, NULL, lb); +} + +static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) +{ + return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 1); +} + +static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) +{ + return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 0); +} + +#define MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, uint64_t addr, \ + uint32_t length, uint32_t lkey, \ + uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, uint64_t addr, \ + uint32_t length, uint32_t lkey, \ + uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(qp, addr, length, lkey, flags, eth, 0, \ + /* safe, wqe_64, use_sg, num_sge, sg_list */ \ + 0, wqe64, 0, 0, NULL, \ + /* lb */ \ + lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_UNSAFE(1, 1, 1); + +/* burst family - send_pending_inline */ +static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr, + uint32_t length, uint32_t flags, + const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr, + uint32_t length, uint32_t flags, + const int lb) +{ + struct mlx4_qp *mqp = to_mqp(qp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = 
mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, */ + return send_pending(qp, (uintptr_t)addr, length, 0, flags, raw_eth, + /* inl, safe, wqe_64, use_sg, num_sge, sg_list, lb */ + 1, 1, wqe_64, 0, 0, NULL, lb); +} + +static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) +{ + return mlx4_send_pending_inl_safe(qp, addr, length, flags, 1); +} + +static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) +{ + return mlx4_send_pending_inl_safe(qp, addr, length, flags, 0); +} + +#define MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_inl_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_INL_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, void *addr, \ + uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, void *addr, \ + uint32_t length, uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(qp, (uintptr_t)addr, length, 0, flags, eth, 1, \ + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \ + 0, wqe64, 0, 0, NULL, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 1); + +/* burst family - send_pending_sg_list */ +static inline int mlx4_send_pending_sg_list_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_sg_list_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(ibqp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, inl, */ + return send_pending(ibqp, 0, 0, 0, flags, raw_eth, 0, + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ + 1, wqe_64, 1, num, sg_list, lb); +} +static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 1); +} + +static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 0); +} + +#define MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_sg_list_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_SG_LIST_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \ + 
struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(ibqp, 0, 0, 0, flags, eth, 0, \ + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \ + 0, wqe64, 1, num, sg_list, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 1); + +static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) __attribute__((always_inline)); +/* burst family - send_burst */ +static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int raw_eth, const int thread_safe, + const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) __attribute__((always_inline)); +static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int raw_eth, const int thread_safe, + const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int i; + + if (unlikely(thread_safe)) + mlx4_lock(&qp->sq.lock); + + for (i = 0; i < num; i++, sg_list++) + /* qp, addr, length, lkey, */ + send_pending(ibqp, sg_list->addr, sg_list->length, sg_list->lkey, + /* flags, raw_eth, inl, safe, wqe_64, use_sg, */ + flags, raw_eth, 0, 0, wqe_64, 0, + /* num_sge, sg_list, lb */ + 0, NULL, lb); + + if (use_bf) + /* use send_flush_unsafe since lock is already taken if needed */ + send_flush_unsafe(ibqp, _1thrd_evict, wqe_64); + else + *qp->sdb = qp->doorbell_qpn; + + if (unlikely(thread_safe)) + mlx4_unlock(&qp->sq.lock); + + return 0; +} + +static inline int mlx4_send_burst_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_burst_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(ibqp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + int _1thrd_evict = mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB || + mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + int use_bf = mqp->db_method != MLX4_QP_DB_METHOD_DB; + + return send_msg_list(ibqp, sg_list, num, flags, raw_eth, 1, wqe_64, use_bf, _1thrd_evict, lb); +} + +static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 1); +} + +static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 0); +} + 
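Every variant generated by the MLX4_SEND_PENDING_* and MLX4_SEND_BURST_* macros is a thin wrapper around one always_inline implementation whose behaviour switches are const ints, so the compiler folds the dead branches away and each wrapper becomes a specialized fast path. The self-contained toy below shows the same technique; the names are invented and the __MLX4_ALGN_FUNC__ alignment attribute is not reproduced.

#include <stdio.h>

/*
 * One generic implementation: 'use_csum' and 'wide' are compile-time
 * constants at every call site, so the unreachable branches disappear
 * after inlining.
 */
static inline int xmit(int payload, const int use_csum, const int wide)
    __attribute__((always_inline));
static inline int xmit(int payload, const int use_csum, const int wide)
{
    int words = wide ? 4 : 1;

    if (use_csum)
        payload ^= 0x5a;        /* stand-in for checksum work */
    return payload * words;
}

/* Macro-generated specializations, one symbol per variant, mirroring
 * how the driver builds its unsafe send_pending/send_burst tables. */
#define DEFINE_XMIT(csum, wide)                              \
    static int xmit_##csum##wide(int payload)                \
    {                                                        \
        return xmit(payload, csum, wide);                    \
    }
DEFINE_XMIT(0, 0)
DEFINE_XMIT(0, 1)
DEFINE_XMIT(1, 0)
DEFINE_XMIT(1, 1)

int main(void)
{
    printf("%d %d %d %d\n",
           xmit_00(7), xmit_01(7), xmit_10(7), xmit_11(7));
    return 0;
}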
+#define MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb) mlx4_send_burst_unsafe_##_1thrd_evict##eth##wqe64##lb +#define MLX4_SEND_BURST_UNSAFE(_1thrd_evict, eth, wqe64, lb) \ + static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 1, _1thrd_evict, \ + lb); \ + } +/* _1thrd_evict, eth, wqe64, lb */ +MLX4_SEND_BURST_UNSAFE(0, 0, 0, 0); +MLX4_SEND_BURST_UNSAFE(0, 0, 0, 1); +MLX4_SEND_BURST_UNSAFE(0, 0, 1, 0); +MLX4_SEND_BURST_UNSAFE(0, 0, 1, 1); +MLX4_SEND_BURST_UNSAFE(0, 1, 0, 0); +MLX4_SEND_BURST_UNSAFE(0, 1, 0, 1); +MLX4_SEND_BURST_UNSAFE(0, 1, 1, 0); +MLX4_SEND_BURST_UNSAFE(0, 1, 1, 1); +MLX4_SEND_BURST_UNSAFE(1, 0, 0, 0); +MLX4_SEND_BURST_UNSAFE(1, 0, 0, 1); +MLX4_SEND_BURST_UNSAFE(1, 0, 1, 0); +MLX4_SEND_BURST_UNSAFE(1, 0, 1, 1); +MLX4_SEND_BURST_UNSAFE(1, 1, 0, 0); +MLX4_SEND_BURST_UNSAFE(1, 1, 0, 1); +MLX4_SEND_BURST_UNSAFE(1, 1, 1, 0); +MLX4_SEND_BURST_UNSAFE(1, 1, 1, 1); + +#define MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb) mlx4_send_burst_unsafe_##eth##wqe64##lb +#define MLX4_SEND_BURST_UNSAFE_DB(eth, wqe64, lb) \ + static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 0, 0, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_BURST_UNSAFE_DB(0, 0, 0); +MLX4_SEND_BURST_UNSAFE_DB(0, 0, 1); +MLX4_SEND_BURST_UNSAFE_DB(0, 1, 0); +MLX4_SEND_BURST_UNSAFE_DB(0, 1, 1); +MLX4_SEND_BURST_UNSAFE_DB(1, 0, 0); +MLX4_SEND_BURST_UNSAFE_DB(1, 0, 1); +MLX4_SEND_BURST_UNSAFE_DB(1, 1, 0); +MLX4_SEND_BURST_UNSAFE_DB(1, 1, 1); + +/* burst family - send_flush */ +static int mlx4_send_flush_db(struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; +static int mlx4_send_flush_db(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + *qp->sdb = qp->doorbell_qpn; + + return 0; +} + +static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + if (qp->last_db_head + 1 == qp->sq.head) { + struct mlx4_wqe_ctrl_seg *ctrl = get_send_wqe(qp, qp->last_db_head & (qp->sq.wqe_cnt - 1)); + int size = ctrl->fence_size & 0x3f; + + /* + * There is no need to check that size > 1 since we get here only + * after using send_pending function, this guarantee that size > 1 + */ + if (wqe64) + copy_wqe_to_bf(qp, ctrl, 64, qp->last_db_head, + 1, _1thrd_evict); + else if (size <= qp->bf_buf_size / 16) + copy_wqe_to_bf(qp, ctrl, align(size * 16, 64), + qp->last_db_head, + 1, _1thrd_evict); + else + *qp->sdb = qp->doorbell_qpn; + } else { + *qp->sdb = qp->doorbell_qpn; + } + qp->last_db_head = qp->sq.head; + + return 0; +} + +#define MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64) mlx4_send_flush_unsafe_##_1thrd_evict##wqe64 +#define MLX4_SEND_FLUSH_UNSAFE(_1thrd_evict, wqe64) \ + static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \ + struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \ + struct ibv_qp *ibqp) \ + { \ + 
return send_flush_unsafe(ibqp, _1thrd_evict, wqe64); \ + } + +/* _1thrd_evict, wqe64 */ +MLX4_SEND_FLUSH_UNSAFE(0, 0); +MLX4_SEND_FLUSH_UNSAFE(1, 0); +MLX4_SEND_FLUSH_UNSAFE(0, 1); +MLX4_SEND_FLUSH_UNSAFE(1, 1); + +/* burst family - recv_burst */ +static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + const int thread_safe, const int use_inlne_recv, const int max_one_sge) __attribute__((always_inline)); +static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + const int thread_safe, const int use_inlne_recv, const int max_one_sge) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + struct mlx4_inlr_rbuff *rbuffs; + unsigned int ind; + int i; + + if (thread_safe) + mlx4_lock(&qp->rq.lock); + + for (i = 0; i < num; ++i) { + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + scat = get_recv_wqe(qp, ind); + __set_data_seg(scat, sg_list); + + if (!max_one_sge) { + scat[1].byte_count = 0; + scat[1].lkey = htonl(MLX4_INVALID_LKEY); + scat[1].addr = 0; + } + + if (use_inlne_recv) { + rbuffs = qp->inlr_buff.buff[ind].sg_list; + qp->inlr_buff.buff[ind].list_len = 1; + rbuffs->rbuff = (void *)(unsigned long)(sg_list->addr); + rbuffs->rlen = sg_list->length; + rbuffs++; + } + sg_list++; + qp->rq.head++; + } + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db = htonl(qp->rq.head & 0xffff); + + if (thread_safe) + mlx4_unlock(&qp->rq.lock); + + return 0; +} + +static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) __MLX4_ALGN_FUNC__; +static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + return recv_burst(ibqp, sg_list, num, 1, qp->max_inlr_sg, qp->rq.max_gs == 1); +} +#define MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge) mlx4_recv_burst_unsafe_##inlr##_1sge +#define MLX4_RECV_BURST_UNSAFE(inlr, _1sge) \ + static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num) __MLX4_ALGN_FUNC__; \ + static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num) \ + { \ + return recv_burst(ibqp, sg_list, num, 0, inlr, _1sge); \ + } +/* inlr, _1sge */ +MLX4_RECV_BURST_UNSAFE(0, 0); +MLX4_RECV_BURST_UNSAFE(1, 0); +MLX4_RECV_BURST_UNSAFE(0, 1); +MLX4_RECV_BURST_UNSAFE(1, 1); + +/* + * qp_burst family implementation for safe QP + */ +struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_lb = { + .send_burst = mlx4_send_burst_safe_lb, + .send_pending = mlx4_send_pending_safe_lb, + .send_pending_inline = mlx4_send_pending_inl_safe_lb, + .send_pending_sg_list = mlx4_send_pending_sg_list_safe_lb, + .recv_burst = mlx4_recv_burst_safe, + .send_flush = mlx4_send_flush_db +}; + +struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_no_lb = { + .send_burst = mlx4_send_burst_safe_no_lb, + .send_pending = mlx4_send_pending_safe_no_lb, + .send_pending_inline = mlx4_send_pending_inl_safe_no_lb, + .send_pending_sg_list = mlx4_send_pending_sg_list_safe_no_lb, + .recv_burst = mlx4_recv_burst_safe, + .send_flush = mlx4_send_flush_db +}; + +/* + * qp_burst family implementation table for unsafe QP + */ +#define MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \ + (lb << 5 | _1thrd_evict << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge) + +#define MLX4_QP_BURST_UNSAFE_TBL_ENTRY(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \ + 
[MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)] = { \ + .send_burst = MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb), \ + .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \ + .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \ + .send_flush = MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64), \ + } +static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_tbl[1 << 6] = { + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 
0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 1), +}; + +#define MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge) \ + (lb << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge) + +#define MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(lb, eth, wqe64, inlr, _1sge) \ + [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)] = { \ + .send_burst = MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb), \ + .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \ + .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \ + .send_flush = mlx4_send_flush_db, \ + } +static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_db_tbl[1 << 5] = { + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 1), +}; + +struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status) +{ + enum ibv_exp_query_intf_status ret = IBV_EXP_INTF_STAT_OK; + struct ibv_exp_qp_burst_family *family = NULL; + uint32_t unsupported_f; + + if ((qp->verbs_qp.qp.state < IBV_QPS_INIT) || (qp->verbs_qp.qp.state > IBV_QPS_RTS)) { + *status = IBV_EXP_INTF_STAT_INVAL_OBJ_STATE; + return NULL; + } + + if (params->flags) { + fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for QP family\n", params->flags); + *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED; + + return NULL; + } + unsupported_f = params->family_flags & 
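Both family tables above are indexed by packing independent boolean properties of the QP (loopback, doorbell method, link layer, 64-byte WQEs, inline receive, single-SGE RQ) into a small integer, so picking an implementation at query time is a single array load. A generic version of the pattern, with invented feature names and only three bits, is sketched here.

#include <stdio.h>

/* Three invented feature bits for the sketch. */
#define IDX(eth, wqe64, inlr) ((eth) << 2 | (wqe64) << 1 | (inlr))

struct ops {
    const char *name;
    int (*send)(int len);
};

static int send_generic(int len)   { return len; }
static int send_eth(int len)       { return len + 14; }   /* pretend: add an L2 header */
static int send_eth_wqe64(int len) { return len + 14; }

#define ENTRY(eth, wqe64, inlr, fn) \
    [IDX(eth, wqe64, inlr)] = { .name = #fn, .send = fn }

/* Every combination gets a slot (the driver enumerates them all). */
static const struct ops ops_tbl[1 << 3] = {
    ENTRY(0, 0, 0, send_generic),
    ENTRY(0, 0, 1, send_generic),
    ENTRY(0, 1, 0, send_generic),
    ENTRY(0, 1, 1, send_generic),
    ENTRY(1, 0, 0, send_eth),
    ENTRY(1, 0, 1, send_eth),
    ENTRY(1, 1, 0, send_eth_wqe64),
    ENTRY(1, 1, 1, send_eth_wqe64),
};

int main(void)
{
    int eth = 1, wqe64 = 0, inlr = 1;           /* properties known at setup time */
    const struct ops *ops = &ops_tbl[IDX(eth, wqe64, inlr)];

    printf("%s -> %d\n", ops->name, ops->send(100));
    return 0;
}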
~(IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK | + IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR); + if (unsupported_f) { + fprintf(stderr, PFX "Family flags(0x%x) are not supported for QP family\n", unsupported_f); + *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED; + + return NULL; + } + + switch (qp->qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case IBV_QPT_RAW_PACKET: + if (qp->model_flags & MLX4_QP_MODEL_FLAG_THREAD_SAFE) { + int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK); + + if (lb) + family = &mlx4_qp_burst_family_safe_lb; + else + family = &mlx4_qp_burst_family_safe_no_lb; + } else { + int eth = qp->qp_type == IBV_QPT_RAW_PACKET && + qp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe64 = qp->sq.wqe_shift == 6; + int inlr = qp->max_inlr_sg != 0; + int _1sge = qp->rq.max_gs == 1; + int _1thrd_evict = qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB || + qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK); + + if (qp->db_method == MLX4_QP_DB_METHOD_DB) + family = &mlx4_qp_burst_family_unsafe_db_tbl + [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)]; + else + family = &mlx4_qp_burst_family_unsafe_tbl + [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)]; + } + break; + + default: + ret = IBV_EXP_INTF_STAT_INVAL_PARARM; + break; + } + + *status = ret; + + return family; +} Index: contrib/ofed/libmlx4/src/srq.c =================================================================== --- contrib/ofed/libmlx4/src/srq.c +++ contrib/ofed/libmlx4/src/srq.c @@ -42,6 +42,7 @@ #include "mlx4.h" #include "doorbell.h" #include "wqe.h" +#include "mlx4-abi.h" static void *get_wqe(struct mlx4_srq *srq, int n) { @@ -52,38 +53,43 @@ { struct mlx4_wqe_srq_next_seg *next; - pthread_spin_lock(&srq->lock); + mlx4_spin_lock(&srq->lock); next = get_wqe(srq, srq->tail); next->next_wqe_index = htons(ind); srq->tail = ind; - pthread_spin_unlock(&srq->lock); + mlx4_spin_unlock(&srq->lock); } int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { - struct mlx4_srq *srq = to_msrq(ibsrq); + struct mlx4_srq *srq; struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scat; int err = 0; int nreq; int i; - pthread_spin_lock(&srq->lock); + if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE) + ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq); + srq = to_msrq(ibsrq); + mlx4_spin_lock(&srq->lock); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (wr->num_sge > srq->max_gs) { - err = -1; + errno = EINVAL; + err = errno; *bad_wr = wr; break; } if (srq->head == srq->tail) { /* SRQ is full*/ - err = -1; + errno = ENOMEM; + err = errno; *bad_wr = wr; break; } @@ -119,7 +125,7 @@ *srq->db = htonl(srq->counter); } - pthread_spin_unlock(&srq->lock); + mlx4_spin_unlock(&srq->lock); return err; } @@ -174,52 +180,153 @@ return 0; } -struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn) +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; + memset(xsrq_table, 0, sizeof *xsrq_table); + xsrq_table->num_xsrq = size; + xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; + xsrq_table->mask = (1 << xsrq_table->shift) - 1; - if (ctx->xrc_srq_table[tind].refcnt) - return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask]; - else - return 
NULL; + pthread_mutex_init(&xsrq_table->mutex, NULL); } -int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn, - struct mlx4_srq *srq) +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; - int ret = 0; + int index; - pthread_mutex_lock(&ctx->xrc_srq_table_mutex); + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + if (xsrq_table->xsrq_table[index].refcnt) + return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; + + return NULL; +} - if (!ctx->xrc_srq_table[tind].refcnt) { - ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1, - sizeof(struct mlx4_srq *)); - if (!ctx->xrc_srq_table[tind].table) { +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq) +{ + int index, ret = 0; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + if (!xsrq_table->xsrq_table[index].refcnt) { + xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, + sizeof(struct mlx4_srq *)); + if (!xsrq_table->xsrq_table[index].table) { ret = -1; goto out; } } - ++ctx->xrc_srq_table[tind].refcnt; - ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq; + xsrq_table->xsrq_table[index].refcnt++; + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; out: - pthread_mutex_unlock(&ctx->xrc_srq_table_mutex); + pthread_mutex_unlock(&xsrq_table->mutex); return ret; } -void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn) +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; + int index; - pthread_mutex_lock(&ctx->xrc_srq_table_mutex); + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); - if (!--ctx->xrc_srq_table[tind].refcnt) - free(ctx->xrc_srq_table[tind].table); + if (--xsrq_table->xsrq_table[index].refcnt) + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; else - ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL; + free(xsrq_table->xsrq_table[index].table); + + pthread_mutex_unlock(&xsrq_table->mutex); +} + +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + struct mlx4_create_xsrq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; - pthread_mutex_unlock(&ctx->xrc_srq_table_mutex); + /* Sanity check SRQ size before proceeding */ + if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) + return NULL; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded)) + goto err; + + srq->max = align_queue_size(attr_ex->attr.max_wr + 1); + srq->max_gs = attr_ex->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 1; + + if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, sizeof(srq->verbs_srq), + attr_ex, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, + srq->verbs_srq.srq_num, srq); + if 
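mlx4_find_xsrq()/mlx4_store_xsrq()/mlx4_clear_xsrq() above keep SRQs in a two-level table: high bits of the SRQ number pick a top-level slot, that slot's second-level array is allocated lazily and reference counted, and the low bits pick the entry. A reduced, single-threaded model of the structure with arbitrary sizes follows (the real code also takes xsrq_table->mutex around updates).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOP_BITS   4                       /* arbitrary sizes for the sketch */
#define LOW_BITS   6
#define LOW_MASK   ((1u << LOW_BITS) - 1)

struct slot {
    void **table;      /* lazily allocated, (1 << LOW_BITS) entries */
    int    refcnt;     /* live objects hashed into this slot */
};

static struct slot slots[1 << TOP_BITS];

static unsigned top_index(uint32_t n)
{
    return (n >> LOW_BITS) & ((1u << TOP_BITS) - 1);
}

static int store(uint32_t n, void *obj)
{
    struct slot *s = &slots[top_index(n)];

    if (!s->refcnt) {
        s->table = calloc(1u << LOW_BITS, sizeof(void *));
        if (!s->table)
            return -1;
    }
    s->refcnt++;
    s->table[n & LOW_MASK] = obj;
    return 0;
}

static void *find(uint32_t n)
{
    struct slot *s = &slots[top_index(n)];

    return s->refcnt ? s->table[n & LOW_MASK] : NULL;
}

static void clear(uint32_t n)
{
    struct slot *s = &slots[top_index(n)];

    if (--s->refcnt)
        s->table[n & LOW_MASK] = NULL;     /* others still live: just drop ours */
    else
        free(s->table);                    /* last one out frees the level */
}

int main(void)
{
    int obj = 42;

    store(0x123, &obj);
    printf("found %p\n", find(0x123));
    clear(0x123);
    printf("after clear: %p\n", find(0x123));
    return 0;
}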
(ret) + goto err_destroy; + + return &srq->verbs_srq.srq; + +err_destroy: + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); +err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); +err: + free(srq); + return NULL; } +int mlx4_destroy_xrc_srq(struct ibv_srq *srq) +{ + struct mlx4_context *mctx = to_mctx(srq->context); + struct mlx4_srq *msrq = to_msrq(srq); + struct mlx4_cq *mcq; + int ret; + + mcq = to_mcq(msrq->verbs_srq.cq); + mlx4_cq_clean(mcq, 0, msrq); + mlx4_lock(&mcq->lock); + mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num); + mlx4_unlock(&mcq->lock); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) { + mlx4_lock(&mcq->lock); + mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq); + mlx4_unlock(&mcq->lock); + return ret; + } + + mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); + mlx4_free_buf(&msrq->buf); + free(msrq->wrid); + free(msrq); + + return 0; +} Index: contrib/ofed/libmlx4/src/verbs.c =================================================================== --- contrib/ofed/libmlx4/src/verbs.c +++ contrib/ofed/libmlx4/src/verbs.c @@ -40,38 +40,130 @@ #include #include #include - +#include +#include +#include +#include +/* Added for reg_mr mmap munmap system calls */ +#include +#include +#include +#include #include "mlx4.h" #include "mlx4-abi.h" +#include "mlx4_exp.h" #include "wqe.h" +#define SHARED_MR_PROC_DIR_NAME "/proc/driver/mlx4_ib/mrs" +#define FPATH_MAX 128 + +int __mlx4_query_device(uint64_t raw_fw_ver, + struct ibv_device_attr *attr) +{ + unsigned major, minor, sub_minor; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t raw_fw_ver; - unsigned major, minor, sub_minor; int ret; - ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + read_init_vars(to_mctx(context)); + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, + sizeof(cmd)); if (ret) return ret; - major = (raw_fw_ver >> 32) & 0xffff; - minor = (raw_fw_ver >> 16) & 0xffff; - sub_minor = raw_fw_ver & 0xffff; + return __mlx4_query_device(raw_fw_ver, attr); +} - snprintf(attr->fw_ver, sizeof attr->fw_ver, - "%d.%d.%03d", major, minor, sub_minor); +#define READL(ptr) (*((uint32_t *)(ptr))) + +static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles) +{ + unsigned int clockhi, clocklo, clockhi1; + int i; + struct mlx4_context *ctx = to_mctx(context); + + if (ctx->hca_core_clock == NULL) + return -EOPNOTSUPP; + + for (i = 0; i < 10; i++) { + clockhi = ntohl(READL(ctx->hca_core_clock)); + clocklo = ntohl(READL(ctx->hca_core_clock + 4)); + clockhi1 = ntohl(READL(ctx->hca_core_clock)); + if (clockhi == clockhi1) + break; + } + + if (clocklo == 0) + clockhi++; + + *cycles = (uint64_t) clockhi << 32 | (uint64_t) clocklo; return 0; } +int mlx4_query_values(struct ibv_context *context, int q_values, + struct ibv_exp_values *values) +{ + struct mlx4_context *ctx = to_mctx(context); + uint64_t cycles; + int err; + uint32_t comp_mask = values->comp_mask; + + values->comp_mask = 0; + + if (q_values & (IBV_EXP_VALUES_HW_CLOCK | IBV_EXP_VALUES_HW_CLOCK_NS)) { + err = mlx4_read_clock(context, &cycles); + if (!err) { + if (comp_mask & IBV_EXP_VALUES_HW_CLOCK) { + values->hwclock = cycles; + values->comp_mask |= 
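mlx4_read_clock() above assembles a 64-bit cycle counter from two 32-bit reads of device memory, re-reading the high word to detect a low-word wrap between the two halves and retrying a bounded number of times. The sketch below runs the same high/low/high pattern against a simulated register pair; the simulation stands in for uncached MMIO reads, and the byte swapping and the low-word-zero adjustment in the driver are omitted.

#include <stdint.h>
#include <stdio.h>

/* Simulated 64-bit device counter exposed as two 32-bit words, the way
 * the HCA core clock is.  In the driver these are MMIO reads. */
static uint64_t fake_counter = 0x00000001fffffff0ull;

static uint32_t read_hi(void) { return (uint32_t)(fake_counter >> 32); }
static uint32_t read_lo(void)
{
    fake_counter += 0x10;               /* the counter keeps ticking between reads */
    return (uint32_t)fake_counter;
}

/*
 * Read high, then low, then high again: if the two high reads agree,
 * the low word did not wrap in between and the pair is consistent.
 */
static uint64_t read_clock(void)
{
    uint32_t hi = 0, lo = 0, hi2 = 0;
    int i;

    for (i = 0; i < 10; i++) {
        hi  = read_hi();
        lo  = read_lo();
        hi2 = read_hi();
        if (hi == hi2)
            break;
    }
    return (uint64_t)hi << 32 | lo;
}

int main(void)
{
    printf("cycles = 0x%llx\n", (unsigned long long)read_clock());
    return 0;
}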
IBV_EXP_VALUES_HW_CLOCK; + } + if (q_values & IBV_EXP_VALUES_HW_CLOCK_NS) { + if (comp_mask & IBV_EXP_VALUES_HW_CLOCK_NS) { + values->hwclock_ns = + ((uint64_t)values->hwclock * + ctx->core_clk.mult) + >> ctx->core_clk.shift; + values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK_NS; + } + } + } + } + return 0; +} int mlx4_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; + int err; + + read_init_vars(to_mctx(context)); + err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); + if (!err && port <= MLX4_PORTS_NUM && port > 0) { + struct mlx4_context *mctx = to_mctx(context); + if (!mctx->port_query_cache[port - 1].valid) { + mctx->port_query_cache[port - 1].link_layer = + attr->link_layer; + mctx->port_query_cache[port - 1].caps = + attr->port_cap_flags; + mctx->port_query_cache[port - 1].valid = 1; + } + } - return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); + return err; } struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) @@ -80,6 +172,7 @@ struct mlx4_alloc_pd_resp resp; struct mlx4_pd *pd; + read_init_vars(to_mctx(context)); pd = malloc(sizeof *pd); if (!pd) return NULL; @@ -107,50 +200,570 @@ return 0; } -struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, - enum ibv_access_flags access) + +static void mlx4_free_mr(struct mlx4_mr *mlx4_mr) +{ + /* mr address was allocated in speical mode - freed accordingly */ + if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR || + mlx4_mr->shared_mr) + mlx4_free_buf(&(mlx4_mr->buf)); + + /* Finally we free the structure itself */ + free(mlx4_mr); +} + + +static void *mlx4_get_contiguous_alloc_fallback(struct mlx4_buf *buf, + struct ibv_pd *pd, size_t length) +{ + + /* We allocate as fallback mode non contiguous pages*/ + if (mlx4_alloc_buf( + buf, + align(length, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) + return NULL; + + return buf->buf; +} + + +/* We'll call mmap on mlx4_ib module to achieve this task */ +static void *mlx4_get_contiguous_alloc(struct mlx4_buf *mlx4_buf, + struct ibv_pd *pd, + size_t length, + void *contig_addr) +{ + size_t alloc_length; + int page_size; + int mr_no_allocator = 0; + int mr_force_contig_pages = 0; + enum mlx4_alloc_type alloc_type; + + mlx4_get_alloc_type(pd->context, MLX4_MR_PREFIX, &alloc_type, + MLX4_ALLOC_TYPE_ALL); + + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG) + mr_force_contig_pages = 1; + else if (alloc_type == MLX4_ALLOC_TYPE_ANON) + mr_no_allocator = 1; + + /* For benchmarking purposes we apply an option to turn off continuous + allocator based on environment variable + */ + if (mr_no_allocator) + return mlx4_get_contiguous_alloc_fallback(mlx4_buf, pd, + length); + + page_size = to_mdev(pd->context->device)->page_size; + alloc_length = (contig_addr ? length : align(length, page_size)); + if (!(mlx4_alloc_buf_contig(to_mctx(pd->context), + mlx4_buf, alloc_length, + page_size, MLX4_MR_PREFIX, contig_addr))) + return contig_addr ? 
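The HW_CLOCK_NS branch above converts cycles to nanoseconds with a precomputed multiply-and-shift, (hwclock * mult) >> shift, instead of a 64-bit division per query; the driver obtains mult and shift at context initialization. The stand-alone sketch shows one common way such a pair can be derived from a clock frequency; the 156.25 MHz figure and the shift choice are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ull

/*
 * Derive a fixed-point pair such that ns ~= (cycles * mult) >> shift.
 * A larger shift gives more precision but shrinks the largest cycle
 * count for which cycles * mult still fits in 64 bits.
 */
static void calc_mult_shift(uint64_t freq_hz, uint32_t shift, uint64_t *mult)
{
    *mult = (NSEC_PER_SEC << shift) / freq_hz;
}

static uint64_t cycles_to_ns(uint64_t cycles, uint64_t mult, uint32_t shift)
{
    return (cycles * mult) >> shift;
}

int main(void)
{
    uint64_t freq = 156250000ull;      /* illustrative core clock: 156.25 MHz */
    uint32_t shift = 21;               /* keeps cycles * mult in 64 bits up to ~2^40 cycles */
    uint64_t mult;

    calc_mult_shift(freq, shift, &mult);

    /* One second worth of cycles should map to roughly 1e9 ns. */
    printf("mult=%llu, 1s -> %llu ns\n",
           (unsigned long long)mult,
           (unsigned long long)cycles_to_ns(freq, mult, shift));
    return 0;
}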
contig_addr : mlx4_buf->buf; + + if (mr_force_contig_pages || contig_addr) + return NULL; + + return mlx4_get_contiguous_alloc_fallback(mlx4_buf, + pd, length); + +} + +static int mlx4_get_shared_mr_name(char *in_pattern, char *file_name) +{ + glob_t results; + int ret; + + ret = glob(in_pattern, 0, NULL, &results); + + if (ret) { + if (mlx4_trace) + /* might be some legacy kernel with old mode */ + fprintf(stderr, "mlx4_get_shared_mr_name: glob failed for %s, ret=%d, errno=%d\n", + in_pattern, ret, errno); + return ret; + } + + if (results.gl_pathc > 1) { + int i; + int duplicate_name = 1; + + /* we encountered an issue where glob retuned same name twice, we suspect it to be + * an issue with glob/procfs. When there is more than one entry check whether all entries + * are the same in that case API succeeded and we use first entry name. + */ + for (i = 1; i < results.gl_pathc; i++) { + if (strcmp(results.gl_pathv[0], results.gl_pathv[i])) { + duplicate_name = 0; + break; + } + } + + if (!duplicate_name) { + fprintf(stderr, "mlx4_get_shared_mr_name failed for %s, unexpected %lu paths were found\n", + in_pattern, (unsigned long)(results.gl_pathc)); + for (i = 0; i < results.gl_pathc; i++) + fprintf(stderr, "mlx4_get_shared_mr_name: path#%d=%s\n", i, + results.gl_pathv[i]); + globfree(&results); + return -EINVAL; + } + } + + strncpy(file_name, results.gl_pathv[0], FPATH_MAX); + file_name[FPATH_MAX - 1] = '\0'; + globfree(&results); + return 0; +} + +struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in) +{ + struct ibv_context *context; + size_t total_size; + int page_size; + char shared_mr_file_name[FPATH_MAX]; + char shared_mr_pattern[FPATH_MAX]; + int fd; + struct stat buffer; + int status; + struct ibv_mr *ibv_mr; + uint64_t shared_flags; + struct mlx4_mr *mlx4_mr = NULL; + void *addr = in->addr; + uint64_t access = in->exp_access; + struct ibv_exp_reg_mr_in rmr_in; + int flags; + int ret; + int is_writeable_mr = !!(access & (IBV_EXP_ACCESS_REMOTE_WRITE | + IBV_EXP_ACCESS_LOCAL_WRITE | IBV_EXP_ACCESS_REMOTE_ATOMIC)); + + context = in->pd->context; + page_size = to_mdev(context->device)->page_size; + sprintf(shared_mr_pattern, "%s/%X.*", + SHARED_MR_PROC_DIR_NAME, in->mr_handle); + + ret = mlx4_get_shared_mr_name(shared_mr_pattern, shared_mr_file_name); + if (ret) + /* For compatability issue trying with legacy name */ + sprintf(shared_mr_file_name, "%s/%X", + SHARED_MR_PROC_DIR_NAME, in->mr_handle); + + flags = is_writeable_mr ? O_RDWR : O_RDONLY; + fd = open(shared_mr_file_name, flags); + if (fd < 0) { + int counter = 10; + /* retrying for 1 second before reporting an error */ + while (fd < 0 && counter > 0) { + usleep(100000); + counter--; + fd = open(shared_mr_file_name, flags); + } + + if (fd < 0) { + fprintf(stderr, "mlx4_reg_shared_mr failed open %s errno=%d\n", + shared_mr_file_name, errno); + return NULL; + } + } + + status = fstat(fd, &buffer); + if (status) { + fprintf(stderr, + "mlx4_reg_shared_mr lstat has failed , errno=%d\n", + errno); + goto error; + } + + total_size = align(buffer.st_size, page_size); + + /* set protection based on access flags input address may be NULL + or other recommended address by the application. + */ + addr = mmap(addr , total_size, + is_writeable_mr ? 
(PROT_WRITE | PROT_READ) : + PROT_READ, MAP_SHARED, + fd, + 0); + + /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/ + if (addr == MAP_FAILED) { + fprintf(stderr, + "mlx4_reg_shared_mr mmap has failed , errno=%d\n", + errno); + goto error; + } + + if (ibv_dontfork_range(addr, total_size)) { + fprintf(stderr, + "mlx4_reg_shared_mr dontfork has failed , errno=%d\n", + errno); + goto err_unmap; + } + + if (access & IBV_EXP_ACCESS_NO_RDMA) { + mlx4_mr = calloc(1, sizeof *mlx4_mr); + if (!mlx4_mr) + goto err_dofork; + + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_NO_RDMA; + ibv_mr = &(mlx4_mr->ibv_mr); + ibv_mr->context = in->pd->context; + + } else { + /* Make sure that shared access flags are off before + calling to reg_mr, otherwise new mr will be shared as well. + */ + shared_flags = IBV_EXP_ACCESS_SHARED_MR_USER_READ | + IBV_EXP_ACCESS_SHARED_MR_USER_WRITE | + IBV_EXP_ACCESS_SHARED_MR_GROUP_READ | + IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE | + IBV_EXP_ACCESS_SHARED_MR_OTHER_READ | + IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE; + + access &= ~shared_flags; + rmr_in.pd = in->pd; + rmr_in.addr = addr; + rmr_in.length = total_size; + rmr_in.exp_access = access; + rmr_in.comp_mask = 0; + + ibv_mr = mlx4_exp_reg_mr(&rmr_in); + if (!ibv_mr) + goto err_dofork; + } + + /* file should be closed - not required any more */ + close(fd); + + ibv_mr->length = total_size; + ibv_mr->addr = addr; + mlx4_mr = to_mmr(ibv_mr); + /* We mark this MR as shared one to be handled correctly via dereg_mr*/ + mlx4_mr->shared_mr = 1; + /* We hook addr & length also internally for further + use via dreg_mr. + */ + mlx4_mr->buf.buf = addr; + mlx4_mr->buf.length = total_size; + return ibv_mr; + +err_dofork: + ibv_dofork_range(addr, total_size); +err_unmap: + munmap(addr, total_size); +error: + close(fd); + return NULL; +} + +int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out) +{ + struct mlx4_mr *mlx4_mr = to_mmr(mr); + + out->need_dofork = (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR || + mlx4_mr->shared_mr) ? 0 : 1; + + return mlx4_dereg_mr(mr); +} + +int mlx4_exp_rereg_mr(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr, + struct ibv_exp_rereg_out *out) +{ + struct mlx4_mr *mlx4_mr = to_mmr(mr); + struct mlx4_buf buf; + struct ibv_exp_rereg_mr cmd; + struct ibv_exp_rereg_mr_resp resp; + int internal_alloc = 0; + int ret; + + if (flags & (~IBV_EXP_REREG_MR_FLAGS_SUPPORTED | IBV_EXP_REREG_MR_KEEP_VALID)) + return -EINVAL; + + /* Currently, we don't support any features in comp_mask */ + if (attr->comp_mask) + return -EINVAL; + + /* Here we check whether contigous pages are required and + should be allocated internally. + */ + + memset(&buf, 0, sizeof(buf)); + if ((flags & IBV_EXP_REREG_MR_CHANGE_ACCESS) && + !addr && (access & IBV_EXP_ACCESS_ALLOCATE_MR)) { + struct ibv_pd *curr_pd = flags & IBV_EXP_REREG_MR_CHANGE_PD ? 
pd : mr->pd; + addr = mlx4_get_contiguous_alloc(&buf, curr_pd, length, NULL); + if (!addr) + return -ENOMEM; + + internal_alloc = 1; + } + + ret = ibv_exp_cmd_rereg_mr(mr, flags, addr, length, + (uintptr_t) addr, + access, pd, attr, + &cmd, sizeof(cmd), 0, + &resp, sizeof(resp), 0); + + if (ret) { + if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) + mlx4_free_buf(&buf); + return ret; + } else { + if (((mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR) || + mlx4_mr->shared_mr) && + (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)) { + mlx4_mr->shared_mr = 0; + mlx4_free_buf(&(mlx4_mr->buf)); + /* The memory was just freed, mark it as NULL */ + mlx4_mr->ibv_mr.addr = NULL; + mlx4_mr->allocation_flags &= ~IBV_EXP_ACCESS_ALLOCATE_MR; + out->need_dofork = 0; + } + if (internal_alloc) { + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR; + /* Address is returned to libibverbs through pointer to + * pointer mechanism + */ + mlx4_mr->ibv_mr.addr = addr; + mlx4_mr->ibv_mr.length = length; + memcpy(&mlx4_mr->buf, &buf, sizeof(mlx4_mr->buf)); + } + } + + return ret; +} + + +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr) +{ + struct ibv_open_xrcd cmd; + struct ibv_open_xrcd_resp resp; + struct verbs_xrcd *xrcd; + int ret; + + xrcd = calloc(1, sizeof *xrcd); + if (!xrcd) + return NULL; + + ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &xrcd->xrcd; + +err: + free(xrcd); + return NULL; +} + +int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) { - struct ibv_mr *mr; + struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (!ret) + free(xrcd); + + return ret; +} + +struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in) +{ + + struct mlx4_mr *mlx4_mr; struct ibv_reg_mr cmd; int ret; + int cmd_access; + int is_contig; + + if ((in->comp_mask > IBV_EXP_REG_MR_RESERVED - 1) || + (in->exp_access > IBV_EXP_ACCESS_RESERVED - 1)) { + errno = EINVAL; + return NULL; + } - mr = malloc(sizeof *mr); - if (!mr) + mlx4_mr = calloc(1, sizeof *mlx4_mr); + if (!mlx4_mr) return NULL; + VALGRIND_MAKE_MEM_DEFINED(&in->create_flags, sizeof(in->create_flags)); + is_contig = ((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) && !in->addr) || + ((in->comp_mask & IBV_EXP_REG_MR_CREATE_FLAGS) && + (in->create_flags & IBV_EXP_REG_MR_CREATE_CONTIG)); + /* Here we check whether contigous pages are required and + should be allocated internally. + */ + if (is_contig) { + in->addr = mlx4_get_contiguous_alloc(&mlx4_mr->buf, in->pd, + in->length, in->addr); + if (!in->addr) { + free(mlx4_mr); + return NULL; + } + + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR; + /* Hooking the addr on returned pointer for + further use by application. 
+ */ + mlx4_mr->ibv_mr.addr = in->addr; + } + + cmd_access = (in->exp_access & (IBV_EXP_START_FLAG - 1)) | + (in->exp_access & (IBV_EXP_ACCESS_RESERVED - 1)) >> IBV_EXP_START_FLAG_LOC; #ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS { struct ibv_reg_mr_resp resp; - ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, - access, mr, &cmd, sizeof cmd, - &resp, sizeof resp); + ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length, + (uintptr_t) in->addr, cmd_access, + &(mlx4_mr->ibv_mr), + &cmd, sizeof(cmd), + &resp, sizeof(resp)); } #else - ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr, - &cmd, sizeof cmd); + ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length, + (uintptr_t) in->addr, cmd_access, + &(mlx4_mr->ibv_mr), + &cmd, sizeof(cmd)); #endif if (ret) { - free(mr); + mlx4_free_mr(mlx4_mr); return NULL; } - return mr; + return &(mlx4_mr->ibv_mr); +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_exp_reg_mr_in in; + + in.pd = pd; + in.addr = addr; + in.length = length; + in.exp_access = access; + in.comp_mask = 0; + + return mlx4_exp_reg_mr(&in); } int mlx4_dereg_mr(struct ibv_mr *mr) { int ret; + struct mlx4_mr *mlx4_mr = to_mmr(mr); + + if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_NO_RDMA) + goto free_mr; ret = ibv_cmd_dereg_mr(mr); if (ret) return ret; +free_mr: + mlx4_free_mr(mlx4_mr); + return 0; +} + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct verbs_mw *vmw; + struct ibv_alloc_mw cmd; + struct ibv_alloc_mw_resp resp; + int ret; + + vmw = malloc(sizeof(*vmw)); + if (!vmw) + return NULL; + memset(vmw, 0, sizeof(*vmw)); + + ret = ibv_cmd_alloc_mw(pd, type, vmw, &cmd, sizeof(cmd), + &resp, sizeof(resp)); + + if (ret) { + free(vmw); + return NULL; + } + vmw->type = type; + + return &vmw->mw; +} + +int mlx4_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + struct ibv_dealloc_mw cmd; + struct verbs_mw *vmw = (struct verbs_mw *)mw; + + ret = ibv_cmd_dealloc_mw(vmw, &cmd, sizeof(cmd)); + if (ret) + return ret; + + free(vmw); + return 0; +} + +int __mlx4_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + int ret; + struct ibv_exp_send_wr *bad_wr = NULL; + struct ibv_exp_send_wr wr = { }; + + wr.exp_opcode = IBV_EXP_WR_BIND_MW; + wr.next = NULL; + + wr.wr_id = mw_bind->wr_id; + wr.exp_send_flags = mw_bind->exp_send_flags; + + wr.bind_mw.mw = mw_bind->mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw_bind->mw->rkey); + wr.bind_mw.bind_info = mw_bind->bind_info; + + ret = mlx4_exp_post_send(mw_bind->qp, &wr, &bad_wr); + + if (ret) + return ret; + + /* updating the mw with the latest rkey. 
*/ + mw_bind->mw->rkey = wr.bind_mw.rkey; - free(mr); return 0; } -static int align_queue_size(int req) +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_exp_mw_bind exp_mw_bind; + + memset(&exp_mw_bind, 0, sizeof(exp_mw_bind)); + exp_mw_bind.qp = qp; + exp_mw_bind.exp_send_flags = mw_bind->send_flags; + exp_mw_bind.wr_id = mw_bind->wr_id; + exp_mw_bind.bind_info.addr = (uint64_t)(uintptr_t)mw_bind->addr; + exp_mw_bind.bind_info.length = mw_bind->length; + exp_mw_bind.bind_info.mr = mw_bind->mr; + exp_mw_bind.bind_info.exp_mw_access_flags = mw_bind->mw_access_flags; + exp_mw_bind.comp_mask = 0; + + return __mlx4_bind_mw(&exp_mw_bind); + +} + +int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + if (mw_bind->comp_mask > IBV_EXP_BIND_MW_RESERVED - 1) + return EINVAL; + return __mlx4_bind_mw(mw_bind); +} + +int align_queue_size(int req) { int nent; @@ -160,36 +773,52 @@ return nent; } -struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector) +static struct ibv_cq *create_cq(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr) { - struct mlx4_create_cq cmd; - struct mlx4_create_cq_resp resp; - struct mlx4_cq *cq; - int ret; - struct mlx4_context *mctx = to_mctx(context); + struct mlx4_create_cq cmd; + struct mlx4_exp_create_cq cmd_e; + struct mlx4_create_cq_resp resp; + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + int thread_safe; /* Sanity check CQ size before proceeding */ if (cqe > 0x3fffff) return NULL; - cq = malloc(sizeof *cq); + cq = calloc(1, sizeof(*cq)); if (!cq) return NULL; cq->cons_index = 0; + cq->wait_index = 0; + cq->wait_count = 0; - if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + thread_safe = !mlx4_single_threaded; + if (attr && (attr->comp_mask & IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN)) { + if (!attr->res_domain) { + errno = EINVAL; + goto err; + } + thread_safe = (to_mres_domain(attr->res_domain)->attr.thread_model == IBV_EXP_THREAD_SAFE); + } + + if (mlx4_lock_init(&cq->lock, thread_safe, mlx4_get_locktype())) goto err; + cq->model_flags = thread_safe ? 
MLX4_CQ_MODEL_FLAG_THREAD_SAFE : 0; + cqe = align_queue_size(cqe + 1); - if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size)) + if (mlx4_alloc_cq_buf(to_mctx(context), &cq->buf, cqe, mctx->cqe_size)) goto err; cq->cqe_size = mctx->cqe_size; - cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); if (!cq->set_ci_db) goto err_buf; @@ -199,16 +828,41 @@ cq->arm_sn = 1; *cq->set_ci_db = 0; - cmd.buf_addr = (uintptr_t) cq->buf.buf; - cmd.db_addr = (uintptr_t) cq->set_ci_db; - - ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, - &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp); + if (NULL != attr) { + cmd_e.buf_addr = (uintptr_t) cq->buf.buf; + cmd_e.db_addr = (uintptr_t) cq->set_ci_db; + } else { + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + } + if (NULL != attr) { + ret = ibv_exp_cmd_create_cq(context, cqe - 1, channel, + comp_vector, &cq->ibv_cq, + &cmd_e.ibv_cmd, + sizeof(cmd_e.ibv_cmd), + sizeof(cmd_e) - sizeof(cmd_e.ibv_cmd), + &resp.ibv_resp, + sizeof(resp.ibv_resp), + sizeof(resp) - sizeof(resp.ibv_resp), + attr); + } else { + ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof(resp)); + } if (ret) goto err_db; cq->cqn = resp.cqn; + cq->stall_next_poll = 0; + cq->stall_enable = mctx->stall_enable; + if (NULL != attr && attr->comp_mask) { + if (cmd_e.ibv_cmd.comp_mask & IBV_EXP_CREATE_CQ_CAP_FLAGS) { + cq->creation_flags = attr->flags; + } + } + + cq->pattern = MLX4_CQ_PATTERN; return &cq->ibv_cq; @@ -216,14 +870,41 @@ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); err_buf: - mlx4_free_buf(&cq->buf); - + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(context), &cq->buf); + else + mlx4_free_buf(&cq->buf); err: free(cq); return NULL; } +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + read_init_vars(to_mctx(context)); + return create_cq(context, cqe, channel, comp_vector, NULL); +} + +struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr) +{ + return create_cq(context, cqe, channel, comp_vector, attr); +} + +int mlx4_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, + int attr_mask) +{ + struct ibv_exp_modify_cq cmd; + return ibv_exp_cmd_modify_cq(cq, attr, attr_mask, &cmd, sizeof(cmd)); +} + int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) { struct mlx4_cq *cq = to_mcq(ibcq); @@ -235,7 +916,7 @@ if (cqe > 0x3fffff) return EINVAL; - pthread_spin_lock(&cq->lock); + mlx4_lock(&cq->lock); cqe = align_queue_size(cqe + 1); if (cqe == ibcq->cqe + 1) { @@ -250,7 +931,7 @@ goto out; } - ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, + ret = mlx4_alloc_cq_buf(to_mctx(ibcq->context), &buf, cqe, cq->cqe_size); if (ret) goto out; @@ -268,17 +949,24 @@ ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd); #endif if (ret) { - mlx4_free_buf(&buf); + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(ibcq->context), &buf); + else + mlx4_free_buf(&buf); goto out; } mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); - mlx4_free_buf(&cq->buf); - cq->buf = buf; + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(ibcq->context), &cq->buf); + else + mlx4_free_buf(&cq->buf); + cq->buf = buf; + mlx4_update_cons_index(cq); out: - pthread_spin_unlock(&cq->lock); + 
mlx4_unlock(&cq->lock); return ret; } @@ -291,14 +979,32 @@ return ret; mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); - mlx4_free_buf(&to_mcq(cq)->buf); + if (to_mcq(cq)->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(cq->context), &to_mcq(cq)->buf); + else + mlx4_free_buf(&to_mcq(cq)->buf); free(to_mcq(cq)); return 0; } +void *mlx4_get_legacy_xrc(struct ibv_srq *srq) +{ + struct mlx4_srq *msrq = to_msrq(srq); + + return msrq->ibv_srq_legacy; +} + +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq) +{ + struct mlx4_srq *msrq = to_msrq(srq); + + msrq->ibv_srq_legacy = legacy_xrc_srq; + return; +} + struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, - struct ibv_srq_init_attr *attr) + struct ibv_srq_init_attr *attr) { struct mlx4_create_srq cmd; struct mlx4_create_srq_resp resp; @@ -309,16 +1015,17 @@ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) return NULL; - srq = malloc(sizeof *srq); + srq = calloc(1, sizeof *srq); if (!srq) return NULL; - if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded)) goto err; srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; + srq->ext_srq = 0; if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) goto err; @@ -332,15 +1039,13 @@ cmd.buf_addr = (uintptr_t) srq->buf.buf; cmd.db_addr = (uintptr_t) srq->db; - ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr, &cmd.ibv_cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err_db; - srq->srqn = resp.srqn; - - return &srq->ibv_srq; + return &srq->verbs_srq.srq; err_db: mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); @@ -355,12 +1060,27 @@ return NULL; } +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || + (attr_ex->srq_type == IBV_SRQT_BASIC)) + return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex); + else if (attr_ex->srq_type == IBV_SRQT_XRC) + return mlx4_create_xrc_srq(context, attr_ex); + + return NULL; +} + int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, - enum ibv_srq_attr_mask attr_mask) + int attr_mask) { struct ibv_modify_srq cmd; + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); } @@ -369,199 +1089,98 @@ { struct ibv_query_srq cmd; + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); } -int mlx4_destroy_srq(struct ibv_srq *ibsrq) +int mlx4_destroy_srq(struct ibv_srq *srq) { - struct mlx4_srq *srq = to_msrq(ibsrq); - struct mlx4_cq *mcq = NULL; int ret; + struct ibv_srq *legacy_srq = NULL; - if (ibsrq->xrc_cq) { - /* is an xrc_srq */ - mcq = to_mcq(ibsrq->xrc_cq); - mlx4_cq_clean(mcq, 0, srq); - pthread_spin_lock(&mcq->lock); - mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn); - pthread_spin_unlock(&mcq->lock); + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) { + legacy_srq = srq; + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); } - ret = ibv_cmd_destroy_srq(ibsrq); - if (ret) { - if (ibsrq->xrc_cq) { - pthread_spin_lock(&mcq->lock); - mlx4_store_xrc_srq(to_mctx(ibsrq->context), - srq->srqn, srq); - pthread_spin_unlock(&mcq->lock); - } - 
return ret; + if (to_msrq(srq)->ext_srq) { + ret = mlx4_destroy_xrc_srq(srq); + if (ret) + return ret; + + if (legacy_srq) + free(legacy_srq); + + return 0; } - mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db); - mlx4_free_buf(&srq->buf); - free(srq->wrid); - free(srq); + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); return 0; } -static int verify_sizes(struct ibv_qp_init_attr *attr, struct mlx4_context *context) +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) { - int size; - int nsegs; - - if (attr->cap.max_send_wr > context->max_qp_wr || - attr->cap.max_recv_wr > context->max_qp_wr || - attr->cap.max_send_sge > context->max_sge || - attr->cap.max_recv_sge > context->max_sge) - return -1; - - if (attr->cap.max_inline_data) { - nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type); - size = MLX4_MAX_WQE_SIZE - nsegs * sizeof (struct mlx4_wqe_inline_seg); - switch (attr->qp_type) { - case IBV_QPT_UD: - size -= (sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_datagram_seg)); - break; - - case IBV_QPT_RC: - case IBV_QPT_UC: - case IBV_QPT_XRC: - size -= (sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_raddr_seg)); - break; - - default: - return 0; - } - - if (attr->cap.max_inline_data > size) - return -1; - } - - return 0; + read_init_vars(to_mctx(context)); + return mlx4_exp_create_qp(context, (struct ibv_exp_qp_init_attr *)attr); } struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { - struct mlx4_create_qp cmd; - struct ibv_create_qp_resp resp; - struct mlx4_qp *qp; - int ret; - struct mlx4_context *context = to_mctx(pd->context); - + struct ibv_exp_qp_init_attr attr_exp; + struct ibv_qp *qp; + /* We should copy below only the shared fields excluding the xrc_domain field. + * Otherwise we may have an ABI issue with applications that were compiled + * without the xrc_domain field. The xrc_domain any way has no affect in + * the sender side, no need to copy in/out. + */ + int init_attr_base_size = offsetof(struct ibv_qp_init_attr, xrc_domain); + + /* copying only shared fields */ + memcpy(&attr_exp, attr, init_attr_base_size); + attr_exp.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + attr_exp.pd = pd; + qp = mlx4_exp_create_qp(pd->context, &attr_exp); + if (qp) + memcpy(attr, &attr_exp, init_attr_base_size); + return qp; +} - /* Sanity check QP size before proceeding */ - if (verify_sizes(attr, context)) - return NULL; +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; - qp = malloc(sizeof *qp); + qp = calloc(1, sizeof *qp); if (!qp) return NULL; - mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); - - /* - * We need to leave 2 KB + 1 WQE of headroom in the SQ to - * allow HW to prefetch. 
- */ - qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; - qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); - qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); - - if (attr->srq || attr->qp_type == IBV_QPT_XRC) - attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0; - else { - if (attr->cap.max_recv_sge < 1) - attr->cap.max_recv_sge = 1; - if (attr->cap.max_recv_wr < 1) - attr->cap.max_recv_wr = 1; - } - - if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) - goto err; - - mlx4_init_qp_indices(qp); - - if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || - pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) - goto err_free; - - if (!attr->srq && attr->qp_type != IBV_QPT_XRC) { - qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); - if (!qp->db) - goto err_free; - - *qp->db = 0; - } - - cmd.buf_addr = (uintptr_t) qp->buf.buf; - if (attr->srq || attr->qp_type == IBV_QPT_XRC) - cmd.db_addr = 0; - else - cmd.db_addr = (uintptr_t) qp->db; - cmd.log_sq_stride = qp->sq.wqe_shift; - for (cmd.log_sq_bb_count = 0; - qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; - ++cmd.log_sq_bb_count) - ; /* nothing */ - cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ - memset(cmd.reserved, 0, sizeof cmd.reserved); - - pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); - - ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, - &resp, sizeof resp); - if (ret) - goto err_rq_db; - - ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); + ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, + &cmd, sizeof cmd, &resp, sizeof resp); if (ret) - goto err_destroy; - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); - - qp->rq.wqe_cnt = attr->cap.max_recv_wr; - qp->rq.max_gs = attr->cap.max_recv_sge; - - /* adjust rq maxima to not exceed reported device maxima */ - attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr); - attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge); - - qp->rq.max_post = attr->cap.max_recv_wr; - mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); - - qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8); - if (attr->sq_sig_all) - qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE); - else - qp->sq_signal_bits = 0; - - return &qp->ibv_qp; - -err_destroy: - ibv_cmd_destroy_qp(&qp->ibv_qp); - -err_rq_db: - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); - if (!attr->srq && attr->qp_type != IBV_QPT_XRC) - mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db); + goto err; -err_free: - free(qp->sq.wrid); - if (qp->rq.wqe_cnt) - free(qp->rq.wrid); - mlx4_free_buf(&qp->buf); + return &qp->verbs_qp.qp; err: free(qp); - return NULL; } int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask, + int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; @@ -582,11 +1201,17 @@ } int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask) + int attr_mask) { struct ibv_modify_qp cmd; int ret; + if (attr_mask & IBV_QP_PORT) { + ret = update_port_data(qp, attr->port_num); + if (ret) + return ret; + } + if (qp->state == IBV_QPS_RESET && attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_INIT) { @@ -598,13 +1223,14 @@ if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { - mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, - qp->srq ? 
to_msrq(qp->srq) : NULL); - if (qp->send_cq != qp->recv_cq) + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); mlx4_init_qp_indices(to_mqp(qp)); - if (!qp->srq && qp->qp_type != IBV_QPT_XRC) + if (to_mqp(qp)->rq.wqe_cnt) *to_mqp(qp)->db = 0; } @@ -616,14 +1242,19 @@ struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) - pthread_spin_lock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { - pthread_spin_lock(&send_cq->lock); - pthread_spin_lock(&recv_cq->lock); + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + mlx4_lock(&send_cq->lock); + else if (qp->recv_cq) + mlx4_lock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + mlx4_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx4_lock(&send_cq->lock); + mlx4_lock(&recv_cq->lock); } else { - pthread_spin_lock(&recv_cq->lock); - pthread_spin_lock(&send_cq->lock); + mlx4_lock(&recv_cq->lock); + mlx4_lock(&send_cq->lock); } } @@ -632,14 +1263,20 @@ struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) - pthread_spin_unlock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { - pthread_spin_unlock(&recv_cq->lock); - pthread_spin_unlock(&send_cq->lock); + + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + mlx4_unlock(&send_cq->lock); + else if (qp->recv_cq) + mlx4_unlock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + mlx4_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx4_unlock(&recv_cq->lock); + mlx4_unlock(&send_cq->lock); } else { - pthread_spin_unlock(&send_cq->lock); - pthread_spin_unlock(&recv_cq->lock); + mlx4_unlock(&send_cq->lock); + mlx4_unlock(&recv_cq->lock); } } @@ -656,246 +1293,120 @@ } mlx4_lock_cqs(ibqp); - - __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); - if (ibqp->send_cq != ibqp->recv_cq) + if (ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); - mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); mlx4_unlock_cqs(ibqp); pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); - if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC) - mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); - free(qp->sq.wrid); + /* + * Use the qp->bf to check if the QP is using dedicated BF. + * If so, update the dedicated BF database. 
+ */ + if (qp->bf && (&qp->bf->cmn != &(to_mctx(ibqp->context)->bfs.cmn_bf))) { + struct mlx4_bfs_data *bfs = &to_mctx(ibqp->context)->bfs; + int idx = &(qp->bf->dedic) - bfs->dedic_bf; + + if (0 <= idx && idx < (MLX4_MAX_BFS_IN_PAGE - 1)) { + mlx4_spin_lock(&bfs->dedic_bf_lock); + bfs->dedic_bf_used[idx] = 0; + bfs->dedic_bf_free++; + mlx4_spin_unlock(&bfs->dedic_bf_lock); + } + } + if (qp->rq.wqe_cnt) - free(qp->rq.wrid); - mlx4_free_buf(&qp->buf); + mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); + + mlx4_dealloc_qp_buf(ibqp->context, qp); + free(qp); return 0; } -struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd, + struct ibv_ah_attr *attr, + uint8_t link_layer) { struct mlx4_ah *ah; - struct ibv_port_attr port_attr; - uint8_t is_mcast; + + if (unlikely(!attr->dlid) && + (link_layer != IBV_LINK_LAYER_ETHERNET)) { + errno = EINVAL; + return NULL; + } ah = malloc(sizeof *ah); if (!ah) return NULL; - memset(ah, 0, sizeof *ah); + memset(&ah->av, 0, sizeof ah->av); ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24)); - ah->av.g_slid = attr->src_path_bits; - ah->av.dlid = htons(attr->dlid); + + if (link_layer != IBV_LINK_LAYER_ETHERNET) { + ah->av.g_slid = attr->src_path_bits; + ah->av.dlid = htons(attr->dlid); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); + } else { + ah->vlan = ((attr->sl & 7) << 13); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29); + } + if (attr->static_rate) { ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; /* XXX check rate cap? */ } - ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); if (attr->is_global) { ah->av.g_slid |= 0x80; ah->av.gid_index = attr->grh.sgid_index; - ah->av.hop_limit = attr->grh.hop_limit; + if (attr->grh.hop_limit < 2) + ah->av.hop_limit = 0xff; + else + ah->av.hop_limit = attr->grh.hop_limit; ah->av.sl_tclass_flowlabel |= htonl((attr->grh.traffic_class << 20) | attr->grh.flow_label); memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); } - if (ibv_query_port(pd->context, attr->port_num, &port_attr)) - goto err; - - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - if (ibv_resolve_eth_gid(pd, attr->port_num, - (union ibv_gid *)ah->av.dgid, - attr->grh.sgid_index, - ah->mac, &ah->vlan, - &ah->tagged, &is_mcast)) - goto err; - - if (is_mcast) { - ah->av.dlid = htons(0xc000); - ah->av.port_pd |= htonl(1 << 31); - } - if (ah->tagged) { - ah->av.port_pd |= htonl(1 << 29); - ah->vlan |= (attr->sl & 7) << 13; - } - } - - return &ah->ibv_ah; -err: - free(ah); - return NULL; } -int mlx4_destroy_ah(struct ibv_ah *ah) -{ - free(to_mah(ah)); - - return 0; -} - -#ifdef HAVE_IBV_XRC_OPS -struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *attr) +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { - struct mlx4_create_xrc_srq cmd; - struct mlx4_create_srq_resp resp; - struct mlx4_srq *srq; - int ret; - - /* Sanity check SRQ size before proceeding */ - if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) - return NULL; - - srq = malloc(sizeof *srq); - if (!srq) - return NULL; - - if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) - goto err; - - srq->max = align_queue_size(attr->attr.max_wr + 1); - srq->max_gs = attr->attr.max_sge; - srq->counter = 0; - - if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) - goto err; - - srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); - if (!srq->db) - 
goto err_free; - - *srq->db = 0; - - cmd.buf_addr = (uintptr_t) srq->buf.buf; - cmd.db_addr = (uintptr_t) srq->db; - - ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr, - xrc_domain->handle, - xrc_cq->handle, - &cmd.ibv_cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp); - if (ret) - goto err_db; + struct ibv_ah *ah; + struct ibv_exp_port_attr port_attr; + struct ibv_port_attr port_attr_legacy; + uint8_t link_layer; - srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn; + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; - ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq); - if (ret) - goto err_destroy; - - return &srq->ibv_srq; - -err_destroy: - ibv_cmd_destroy_srq(&srq->ibv_srq); - -err_db: - mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); - -err_free: - free(srq->wrid); - mlx4_free_buf(&srq->buf); + if (ibv_exp_query_port(pd->context, attr->port_num, &port_attr)) { + if (ibv_query_port(pd->context, attr->port_num, &port_attr_legacy)) + return NULL; -err: - free(srq); - - return NULL; -} - -struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context, - int fd, int oflag) -{ - int ret; - struct mlx4_open_xrc_domain_resp resp; - struct mlx4_xrc_domain *xrcd; - - xrcd = malloc(sizeof *xrcd); - if (!xrcd) - return NULL; - - ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd, - &resp.ibv_resp, sizeof resp); - if (ret) { - free(xrcd); - return NULL; + link_layer = port_attr_legacy.link_layer; + } else { + link_layer = port_attr.link_layer; } - xrcd->xrcdn = resp.xrcdn; - return &xrcd->ibv_xrcd; -} - -int mlx4_close_xrc_domain(struct ibv_xrc_domain *d) -{ - int ret; - ret = ibv_cmd_close_xrc_domain(d); - if (!ret) - free(d); - return ret; -} - -int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_qp_num) -{ - - return ibv_cmd_create_xrc_rcv_qp(init_attr, xrc_qp_num); -} + ah = mlx4_create_ah_common(pd, attr, link_layer); -int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask) -{ - return ibv_cmd_modify_xrc_rcv_qp(xrc_domain, xrc_qp_num, - attr, attr_mask); + return ah; } -int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr) +int mlx4_destroy_ah(struct ibv_ah *ah) { - int ret; - - ret = ibv_cmd_query_xrc_rcv_qp(xrc_domain, xrc_qp_num, - attr, attr_mask, init_attr); - if (ret) - return ret; - - init_attr->cap.max_send_wr = init_attr->cap.max_send_sge = 1; - init_attr->cap.max_recv_sge = init_attr->cap.max_recv_wr = 0; - init_attr->cap.max_inline_data = 0; - init_attr->recv_cq = init_attr->send_cq = NULL; - init_attr->srq = NULL; - init_attr->xrc_domain = xrc_domain; - init_attr->qp_type = IBV_QPT_XRC; - init_attr->qp_context = NULL; - attr->cap = init_attr->cap; + free(to_mah(ah)); return 0; } - -int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num) -{ - return ibv_cmd_reg_xrc_rcv_qp(xrc_domain, xrc_qp_num); -} - -int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num) -{ - return ibv_cmd_unreg_xrc_rcv_qp(xrc_domain, xrc_qp_num); -} - -#endif Index: contrib/ofed/libmlx4/src/verbs_exp.c =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/verbs_exp.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Added for reg_mr mmap munmap system calls */ +#include +#include "mlx4.h" +#include "mlx4-abi.h" +#include "mlx4_exp.h" +#include "wqe.h" + +static const char *qptype2key(enum ibv_qp_type type) +{ + switch (type) { + case IBV_QPT_RC: return "HUGE_RC"; + case IBV_QPT_UC: return "HUGE_UC"; + case IBV_QPT_UD: return "HUGE_UD"; +#ifdef _NOT_EXISTS_IN_OFED_2_0 + case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH"; +#endif + + default: return "HUGE_NA"; + } +} + +static void update_qp_cap_cache(struct ibv_qp *qp) +{ + struct mlx4_context *ctx = to_mctx(qp->context); + struct mlx4_qp *mqp = to_mqp(qp); + + if (((qp->qp_type == IBV_QPT_RAW_ETH) && (mqp->link_layer == IBV_LINK_LAYER_ETHERNET)) && + (ctx->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT)) + mqp->qp_cap_cache |= MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP; +} + +int update_port_data(struct ibv_qp *qp, uint8_t port_num) +{ + struct mlx4_qp *mqp = to_mqp(qp); + struct ibv_port_attr port_attr; + int err; + + err = ibv_query_port(qp->context, port_num, &port_attr); + if (err) + return err; + + mqp->link_layer = port_attr.link_layer; + update_qp_cap_cache(qp); + + return 0; +} + +int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t attr_mask) +{ + struct ibv_exp_modify_qp cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + if (attr_mask & IBV_QP_PORT) { + ret = update_port_data(qp, attr->port_num); + if (ret) + return ret; + } + + if (qp->state == IBV_QPS_RESET && + (attr_mask & IBV_EXP_QP_STATE) && + attr->qp_state == IBV_QPS_INIT) { + mlx4_qp_init_sq_ownership(to_mqp(qp)); + } + + + ret = ibv_exp_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); + + if (!ret && + (attr_mask & IBV_EXP_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? 
to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mlx4_init_qp_indices(to_mqp(qp)); + if (to_mqp(qp)->rq.wqe_cnt) + *to_mqp(qp)->db = 0; + } + + return ret; +} + +static int verify_sizes(struct ibv_exp_qp_init_attr *attr, struct mlx4_context *context) +{ + int size; + int nsegs; + + if (attr->cap.max_send_wr > context->max_qp_wr || + attr->cap.max_recv_wr > context->max_qp_wr || + attr->cap.max_send_sge > context->max_sge || + attr->cap.max_recv_sge > context->max_sge) + return -1; + + if (attr->cap.max_inline_data) { + nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type); + size = MLX4_MAX_WQE_SIZE - nsegs * sizeof(struct mlx4_wqe_inline_seg); + switch (attr->qp_type) { + case IBV_QPT_UD: + size -= (sizeof(struct mlx4_wqe_ctrl_seg) + + sizeof(struct mlx4_wqe_datagram_seg)); + break; + + case IBV_QPT_RC: + case IBV_QPT_UC: + size -= (sizeof(struct mlx4_wqe_ctrl_seg) + + sizeof(struct mlx4_wqe_raddr_seg)); + break; + + default: + return 0; + } + + if (attr->cap.max_inline_data > size) + return -1; + } + + return 0; +} + +static int mlx4_exp_alloc_qp_buf(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr, + struct mlx4_qp *qp) +{ + int ret; + enum mlx4_alloc_type alloc_type; + enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + const char *qp_huge_key; + int i, wqe_size; + + qp->rq.max_gs = attr->cap.max_recv_sge; + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + if ((attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) && (attr->max_inl_recv)) { + qp->max_inlr_sg = qp->rq.max_gs; + wqe_size = max(wqe_size, attr->max_inl_recv); + } + for (qp->rq.wqe_shift = 4; 1 << qp->rq.wqe_shift < wqe_size; qp->rq.wqe_shift++) + ; /* nothing */ + + if (qp->max_inlr_sg) { + attr->max_inl_recv = 1 << qp->rq.wqe_shift; + qp->max_inlr_sg = attr->max_inl_recv / sizeof(struct mlx4_wqe_data_seg); + } + + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t)); + if (!qp->sq.wrid) + return -1; + } + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + + if (qp->max_inlr_sg) { + qp->inlr_buff.buff = malloc(qp->rq.wqe_cnt * sizeof(*(qp->inlr_buff.buff))); + if (!qp->inlr_buff.buff) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + qp->inlr_buff.len = qp->rq.wqe_cnt; + qp->inlr_buff.buff[0].sg_list = malloc(qp->rq.wqe_cnt * + sizeof(*(qp->inlr_buff.buff->sg_list)) * + qp->max_inlr_sg); + if (!qp->inlr_buff.buff->sg_list) { + free(qp->sq.wrid); + free(qp->rq.wrid); + free(qp->inlr_buff.buff); + return -1; + } + for (i = 1; i < qp->rq.wqe_cnt; i++) + qp->inlr_buff.buff[i].sg_list = &qp->inlr_buff.buff[0].sg_list[i * qp->max_inlr_sg]; + } + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + if (qp->buf_size) { + /* compatability support */ + qp_huge_key = qptype2key(attr->qp_type); + if (mlx4_use_huge(context, qp_huge_key)) + default_alloc_type = MLX4_ALLOC_TYPE_HUGE; + + + mlx4_get_alloc_type(context, MLX4_QP_PREFIX, &alloc_type, + default_alloc_type); + + ret = mlx4_alloc_prefered_buf(to_mctx(context), &qp->buf, + align(qp->buf_size, to_mdev + (context->device)->page_size), + to_mdev(context->device)->page_size, + alloc_type, + MLX4_QP_PREFIX); + + if (ret) { + free(qp->sq.wrid); + free(qp->rq.wrid); + if (qp->max_inlr_sg) { + free(qp->inlr_buff.buff[0].sg_list); + free(qp->inlr_buff.buff); + } 
+ return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.buf = qp->buf.buf; + qp->sq.buf = qp->buf.buf + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + } else { + qp->rq.buf = qp->buf.buf + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + qp->sq.buf = qp->buf.buf; + } + + } else { + qp->buf.buf = NULL; + } + + return 0; +} + +static uint64_t send_db_to_uar(uintptr_t send_db) +{ + return (send_db - MLX4_SEND_DOORBELL); +} + +static uint32_t *uar_to_send_db(uintptr_t uar) +{ + return (uint32_t *)(uar + MLX4_SEND_DOORBELL); +} + +static void update_qp_bf_data(struct mlx4_res_domain *res_domain, + struct mlx4_qp *qp, struct ibv_context *context) +{ + switch (res_domain->type) { + case MLX4_RES_DOMAIN_BF_SAFE: + qp->db_method = MLX4_QP_DB_METHOD_BF; + break; + case MLX4_RES_DOMAIN_BF_UNSAFE: + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + break; + case MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT: + if (to_mctx(context)->prefer_bf) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB; + else + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + break; + default: + break; + } + qp->bf = &res_domain->send_db->bf; + qp->sdb = res_domain->send_db->db_addr; + qp->bf_buf_size = to_mctx(context)->bfs.buf_size; +} + +struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr) +{ + struct mlx4_qp *qp; + int ret; + union { + struct mlx4_create_qp basic; + struct mlx4_exp_create_qp extended; + } cmd_obj; + union { + struct ibv_create_qp_resp basic; + struct ibv_exp_create_qp_resp extended; + } resp_obj; + struct mlx4_create_qp_base *cmd = NULL; + int ext_kernel_cmd = 0; + struct mlx4_bfs_data *bfs = &to_mctx(context)->bfs; + int i; + unsigned char cq_update; + int thread_safe = !mlx4_single_threaded; + int db_method_defined = 0; + + memset(&resp_obj, 0, sizeof(resp_obj)); + memset(&cmd_obj, 0, sizeof(cmd_obj)); + + if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_RESERVED1) { + errno = ENOSYS; + return NULL; + } + + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) { + if (attr->srq) + attr->max_inl_recv = 0; + else + attr->max_inl_recv = min(attr->max_inl_recv, + (to_mctx(context)->max_sge * + sizeof(struct mlx4_wqe_data_seg))); + } + + /* Sanity check QP size before proceeding */ + if (verify_sizes(attr, to_mctx(context))) + return NULL; + + if (attr->qp_type == IBV_QPT_XRC && attr->recv_cq && + attr->cap.max_recv_wr > 0 && mlx4_trace) + fprintf(stderr, PFX "Warning: Legacy XRC sender should not use a recieve cq\n"); + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + qp->qp_cap_cache = 0; + if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS) + ext_kernel_cmd = 1; + if (attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; + } else { + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG && + attr->max_atomic_arg != 0) { + if (attr->max_atomic_arg == 8) { + qp->is_masked_atomic = 1; + } else { + fprintf(stderr, "%s: max_atomic_arg = %d is not valid for mlx4 (use 8 or 0)\n", + __FUNCTION__, attr->max_atomic_arg); + errno = EINVAL; + goto err; + } + } + + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. 
+ */ +#ifdef MLX4_WQE_FORMAT + qp->sq_spare_wqes = 0; +#else + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; +#endif + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + } + + if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || + attr->qp_type == IBV_QPT_XRC_RECV || + attr->qp_type == IBV_QPT_XRC) { + attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) + attr->max_inl_recv = 0; + } else { + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS) + qp->create_flags = attr->exp_create_flags & IBV_EXP_QP_CREATE_MASK; + + if (mlx4_exp_alloc_qp_buf(context, attr, qp)) + goto err; + + mlx4_init_qp_indices(qp); + + qp->sdb = (uint32_t *) (to_mctx(context)->uar + MLX4_SEND_DOORBELL); + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_RES_DOMAIN) { + struct mlx4_res_domain *rd; + + if (!attr->res_domain) { + errno = EINVAL; + goto err_free; + } + rd = to_mres_domain(attr->res_domain); + if (rd->attr.thread_model == IBV_EXP_THREAD_UNSAFE || + rd->attr.thread_model == IBV_EXP_THREAD_SINGLE) + thread_safe = 0; + + if (rd->send_db) { + cmd_obj.extended.exp_cmd.uar_virt_add = send_db_to_uar((uintptr_t)rd->send_db->db_addr); + update_qp_bf_data(rd, qp, context); + db_method_defined = 1; + } + } + + if (mlx4_lock_init(&qp->sq.lock, thread_safe, mlx4_get_locktype())) + goto err_free; + if (mlx4_lock_init(&qp->rq.lock, thread_safe, mlx4_get_locktype())) + goto sq_lock_destroy; + + cmd = (ext_kernel_cmd ? + &cmd_obj.extended.exp_cmd.base : &cmd_obj.basic.base); + + if (attr->cap.max_recv_sge) { + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto rq_lock_destroy; + + *qp->db = 0; + cmd->db_addr = (uintptr_t) qp->db; + } else { + cmd->db_addr = 0; + } + + cmd->buf_addr = (uintptr_t) qp->buf.buf; + cmd->log_sq_stride = qp->sq.wqe_shift; + for (cmd->log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd->log_sq_bb_count; + ++cmd->log_sq_bb_count) + ; /* nothing */ + cmd->sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + memset(cmd->reserved, 0, sizeof(cmd->reserved)); + + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); + ret = ibv_exp_cmd_create_qp(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + ext_kernel_cmd ? + (void *)&cmd_obj.extended.ibv_cmd : + (void *)&cmd_obj.basic.ibv_cmd, + ext_kernel_cmd ? + sizeof(cmd_obj.extended.ibv_cmd) : + sizeof(cmd_obj.basic.ibv_cmd), + ext_kernel_cmd ? + sizeof(cmd_obj.extended.exp_cmd) : + sizeof(cmd_obj.basic.base), + ext_kernel_cmd ? + (void *)&resp_obj.extended : (void *)&resp_obj.basic, + ext_kernel_cmd ? 
+ sizeof(resp_obj.extended) : + sizeof(resp_obj.basic), + 0, 0); + if (ret) { + errno = ret; + goto err_rq_db; + } + + if (qp->max_inlr_sg && (attr->max_inl_recv != (1 << qp->rq.wqe_shift))) + goto err_destroy; + + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp); + if (ret) + goto err_destroy; + } + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + + qp->rq.wqe_cnt = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + + /* adjust rq maxima to not exceed reported device maxima */ + attr->cap.max_recv_wr = min(to_mctx(context)->max_qp_wr, + attr->cap.max_recv_wr); + attr->cap.max_recv_sge = min(to_mctx(context)->max_sge, + attr->cap.max_recv_sge); + + qp->rq.max_post = attr->cap.max_recv_wr; + if (attr->qp_type != IBV_QPT_XRC_RECV) + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htonl(qp->verbs_qp.qp.qp_num << 8); + if (attr->sq_sig_all) + cq_update = MLX4_WQE_CTRL_CQ_UPDATE; + else + cq_update = 0; + + /* + * The rcrb_flags_tbl is a table to get the right value for the first + * byte of srcrb_flags field on the WQE ctrl segment. + * The value is derived from the QP sq_sig_all flag and the 4 WR flags + * IBV_EXP_SEND_SIGNALED, IBV_EXP_SEND_SOLICITED, IBV_EXP_SEND_IP_CSUM + * and IBV_EXP_SEND_TUNNEL. + * These flags used as an index to get the required value from the table. + * The IBV_EXP_SEND_SIGNALED flag defines first bit of the index the + * IBV_EXP_SEND_SOLICITED defines the second bit the IBV_EXP_SEND_IP_CSUM + * defines the third bit and IBV_EXP_SEND_TUNNEL the fourth one. + * Therefore to calculate the index we can use: + * idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED | + * (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) | + * (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2); + * (exp_send_flags & IBV_EXP_SEND_TUNNEL)/(IBV_EXP_SEND_TUNNEL >> 3); + */ + qp->srcrb_flags_tbl[0] = cq_update; + qp->srcrb_flags_tbl[1] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[2] = MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[3] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[4] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | cq_update; + qp->srcrb_flags_tbl[5] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[6] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[7] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[8] = cq_update; + qp->srcrb_flags_tbl[9] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[10] = MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[11] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[12] = MLX4_WQE_CTRL_IP_CSUM | cq_update; + qp->srcrb_flags_tbl[13] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[14] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[15] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + + qp->qp_type = attr->qp_type; + + /* Set default value of cached RX csum flags to 0 */ + qp->cached_rx_csum_flags = 0; + /* Set transposed_rx_csum_flags to match the cached_rx_csum_flags = 0 */ + qp->transposed_rx_csum_flags = IBV_EXP_CQ_RX_OUTER_IPV6_PACKET; + + 
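	/*
	 * A minimal sketch, assuming only the index mapping spelled out in the
	 * srcrb_flags_tbl comment above (SIGNALED selects bit 0, SOLICITED
	 * bit 1, IP_CSUM bit 2, TUNNEL bit 3).  The helper name and the
	 * uint64_t flags type are illustrative and are not part of this patch;
	 * the IBV_EXP_SEND_* flags are the ones named in that comment.
	 */
	static inline int example_srcrb_flags_idx(uint64_t exp_send_flags)
	{
		int idx = 0;

		if (exp_send_flags & IBV_EXP_SEND_SIGNALED)
			idx |= 1 << 0;
		if (exp_send_flags & IBV_EXP_SEND_SOLICITED)
			idx |= 1 << 1;
		if (exp_send_flags & IBV_EXP_SEND_IP_CSUM)
			idx |= 1 << 2;
		if (exp_send_flags & IBV_EXP_SEND_TUNNEL)
			idx |= 1 << 3;

		/*
		 * For example, SIGNALED | IP_CSUM yields idx 5, so the WQE gets
		 * srcrb_flags_tbl[5]: IP and TCP/UDP checksum offload plus
		 * CQ_UPDATE, in addition to the per-QP cq_update bits.
		 */
		return idx;
	}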
if (!db_method_defined && bfs->buf_size == 0) { + /* not using BF */ + qp->db_method = MLX4_QP_DB_METHOD_DB; + } else if (!db_method_defined) { + /* + * To gain performance the dedic_bf_free is first tested without taking + * the dedic_bf_lock. + */ + if (bfs->dedic_bf_free) { + mlx4_spin_lock(&bfs->dedic_bf_lock); + for (i = 0 ; i < bfs->num_dedic_bfs; i++) { + if (!bfs->dedic_bf_used[i]) { + /* using dedicated BF */ + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + qp->bf = (union mlx4_bf *)(&bfs->dedic_bf[i]); + bfs->dedic_bf_used[i] = 1; + bfs->dedic_bf_free--; + break; + } + } + mlx4_spin_unlock(&bfs->dedic_bf_lock); + } + if (!qp->bf) { + /* using common BF */ + if (mlx4_single_threaded) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + else + qp->db_method = MLX4_QP_DB_METHOD_BF; + qp->bf = (union mlx4_bf *)(&bfs->cmn_bf); + } + if (qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF && + mlx4_single_threaded && (wc_auto_evict_size() == 64)) { + if (to_mctx(context)->prefer_bf) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB; + else + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + } + qp->bf_buf_size = bfs->buf_size; + } + + qp->model_flags = thread_safe ? MLX4_QP_MODEL_FLAG_THREAD_SAFE : 0; + mlx4_update_post_send_one(qp); + qp->pattern = MLX4_QP_PATTERN; + + return &qp->verbs_qp.qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->verbs_qp.qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + if (attr->cap.max_recv_sge) + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); + +rq_lock_destroy: + mlx4_lock_destroy(&qp->rq.lock); + +sq_lock_destroy: + mlx4_lock_destroy(&qp->sq.lock); + +err_free: + mlx4_dealloc_qp_buf(context, qp); + +err: + free(qp); + + return NULL; +} + +int mlx4_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *device_attr) +{ + struct ibv_exp_query_device cmd; + struct ibv_port_attr port_attr; + uint64_t raw_fw_ver; + int ret; + int i; + + ret = ibv_exp_cmd_query_device(context, device_attr, &raw_fw_ver, + &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (device_attr->exp_device_cap_flags & IBV_EXP_DEVICE_CROSS_CHANNEL) { + device_attr->comp_mask |= IBV_EXP_DEVICE_ATTR_CALC_CAP; + device_attr->calc_cap.data_types = (1ULL << IBV_EXP_CALC_DATA_TYPE_INT) | + (1ULL << IBV_EXP_CALC_DATA_TYPE_UINT) | + (1ULL << IBV_EXP_CALC_DATA_TYPE_FLOAT); + device_attr->calc_cap.data_sizes = (1ULL << IBV_EXP_CALC_DATA_SIZE_64_BIT); + device_attr->calc_cap.int_ops = (1ULL << IBV_EXP_CALC_OP_ADD) | + (1ULL << IBV_EXP_CALC_OP_BAND) | + (1ULL << IBV_EXP_CALC_OP_BXOR) | + (1ULL << IBV_EXP_CALC_OP_BOR); + device_attr->calc_cap.uint_ops = device_attr->calc_cap.int_ops; + device_attr->calc_cap.fp_ops = device_attr->calc_cap.int_ops; + } + device_attr->exp_device_cap_flags |= IBV_EXP_DEVICE_MR_ALLOCATE; + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS) && + (device_attr->exp_device_cap_flags & (IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT | + IBV_EXP_DEVICE_RX_CSUM_IP_PKT | + IBV_EXP_DEVICE_VXLAN_SUPPORT))) { + for (i = 0; i < device_attr->phys_port_cnt; i++) { + ret = mlx4_query_port(context, i + 1, &port_attr); + if (ret) + return ret; + + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + device_attr->exp_device_cap_flags &= ~(IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT | + IBV_EXP_DEVICE_RX_CSUM_IP_PKT | + IBV_EXP_DEVICE_VXLAN_SUPPORT); + break; + } + } + } + + return __mlx4_query_device( + raw_fw_ver, + (struct ibv_device_attr *)device_attr); +} + +int mlx4_exp_query_port(struct ibv_context *context, 
uint8_t port_num, + struct ibv_exp_port_attr *port_attr) +{ + /* Check that only valid flags were given */ + if (!(port_attr->comp_mask & IBV_EXP_QUERY_PORT_ATTR_MASK1) || + (port_attr->comp_mask & ~IBV_EXP_QUERY_PORT_ATTR_MASKS) || + (port_attr->mask1 & ~IBV_EXP_QUERY_PORT_MASK)) { + return EINVAL; + } + + /* Optimize the link type query */ + if (port_attr->comp_mask == IBV_EXP_QUERY_PORT_ATTR_MASK1) { + if (!(port_attr->mask1 & ~(IBV_EXP_QUERY_PORT_LINK_LAYER | + IBV_EXP_QUERY_PORT_CAP_FLAGS))) { + struct mlx4_context *mctx = to_mctx(context); + if (port_num <= 0 || port_num > MLX4_PORTS_NUM) + return EINVAL; + if (mctx->port_query_cache[port_num - 1].valid) { + if (port_attr->mask1 & + IBV_EXP_QUERY_PORT_LINK_LAYER) + port_attr->link_layer = + mctx-> + port_query_cache[port_num - 1]. + link_layer; + if (port_attr->mask1 & + IBV_EXP_QUERY_PORT_CAP_FLAGS) + port_attr->port_cap_flags = + mctx-> + port_query_cache[port_num - 1]. + caps; + return 0; + } + } + if (port_attr->mask1 & IBV_EXP_QUERY_PORT_STD_MASK) { + return mlx4_query_port(context, port_num, + &port_attr->port_attr); + } + } + + return EOPNOTSUPP; +} + +struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd, + struct ibv_exp_ah_attr *attr_ex) +{ + struct ibv_exp_port_attr port_attr; + struct ibv_ah *ah; + struct mlx4_ah *mah; + + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; + + if (ibv_exp_query_port(pd->context, attr_ex->port_num, &port_attr)) + return NULL; + + ah = mlx4_create_ah_common(pd, (struct ibv_ah_attr *)attr_ex, + port_attr.link_layer); + + if (NULL == ah) + return NULL; + + mah = to_mah(ah); + + /* If vlan was given, check that we could use it */ + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID && + attr_ex->vid <= 0xfff && + (0 == attr_ex->ll_address.len || + !(attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL))) + goto err; + + /* ll_address.len == 0 means no ll address given */ + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL && + 0 != attr_ex->ll_address.len) { + if (LL_ADDRESS_ETH != attr_ex->ll_address.type || + port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) + /* mlx4 provider currently only support ethernet + * extensions */ + goto err; + + /* link layer is ethernet */ + if (6 != attr_ex->ll_address.len || + NULL == attr_ex->ll_address.address) + goto err; + + memcpy(mah->mac, attr_ex->ll_address.address, + attr_ex->ll_address.len); + + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID && + attr_ex->vid <= 0xfff) { + mah->av.port_pd |= htonl(1 << 29); + mah->vlan = attr_ex->vid | + ((attr_ex->sl & 7) << 13); + } + } + + return ah; + +err: + free(ah); + return NULL; +} + +static struct mlx4_send_db_data *allocate_send_db(struct mlx4_context *ctx) +{ + struct mlx4_device *dev = to_mdev(ctx->ibv_ctx.device); + struct mlx4_send_db_data *send_db = NULL; + unsigned int uar_idx; + void *uar; + void *bfs; + int i; + + if (!ctx->max_ctx_res_domain || !ctx->bfs.buf_size) { + errno = EINVAL; + return NULL; + } + + mlx4_spin_lock(&ctx->send_db_lock); + if (!list_empty(&ctx->send_db_list)) { + send_db = list_entry(ctx->send_db_list.next, struct mlx4_send_db_data, list); + list_del(&send_db->list); + } + mlx4_spin_unlock(&ctx->send_db_lock); + + if (!send_db) { + /* Fill up more send_db objects */ + mlx4_spin_lock(&ctx->send_db_lock); + if ((ctx->send_db_num_uars + 1) * ctx->bf_regs_per_page >= ctx->max_ctx_res_domain) { + mlx4_spin_unlock(&ctx->send_db_lock); + errno = ENOMEM; + return NULL; + } + uar_idx = ctx->send_db_num_uars; + ctx->send_db_num_uars++; + 
+        mlx4_spin_unlock(&ctx->send_db_lock);
+
+        uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+                   ctx->ibv_ctx.cmd_fd,
+                   dev->page_size * (MLX4_IB_EXP_MMAP_EXT_UAR_PAGE |
+                                     (uar_idx << MLX4_MMAP_CMD_BITS)));
+        if (uar == MAP_FAILED)
+            return NULL;
+        bfs = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+                   ctx->ibv_ctx.cmd_fd,
+                   dev->page_size * (MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE |
+                                     (uar_idx << MLX4_MMAP_CMD_BITS)));
+        if (bfs == MAP_FAILED) {
+            munmap(uar, dev->page_size);
+            return NULL;
+        }
+        mlx4_spin_lock(&ctx->send_db_lock);
+        for (i = 0; i < ctx->bf_regs_per_page; i++) {
+            send_db = calloc(1, sizeof(*send_db));
+            if (!send_db) {
+                if (i)
+                    break;
+                mlx4_spin_unlock(&ctx->send_db_lock);
+                errno = ENOMEM;
+                return NULL;
+            }
+
+            mlx4_lock_init(&send_db->bf.cmn.lock,
+                           !mlx4_single_threaded,
+                           mlx4_get_locktype());
+
+            send_db->db_addr = uar_to_send_db((uintptr_t)uar);
+
+            /* Allocate a pair of blue-flames to toggle sends between them */
+            send_db->bf.cmn.address = bfs + (i * ctx->bfs.buf_size * 2);
+            list_add(&send_db->list, &ctx->send_db_list);
+        }
+
+        /* Return the last send_db object to the caller */
+        list_del(&send_db->list);
+        mlx4_spin_unlock(&ctx->send_db_lock);
+    }
+
+    return send_db;
+}
+
+static void free_send_db(struct mlx4_context *ctx,
+                         struct mlx4_send_db_data *send_db)
+{
+    mlx4_spin_lock(&ctx->send_db_lock);
+    list_add(&send_db->list, &ctx->send_db_list);
+    mlx4_spin_unlock(&ctx->send_db_lock);
+}
+
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+                                                      struct ibv_exp_res_domain_init_attr *attr)
+{
+    struct mlx4_context *ctx = to_mctx(context);
+    struct mlx4_res_domain *res_domain;
+
+    if (attr->comp_mask >= IBV_EXP_RES_DOMAIN_RESERVED) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    res_domain = calloc(1, sizeof(*res_domain));
+    if (!res_domain) {
+        errno = ENOMEM;
+        return NULL;
+    }
+
+    res_domain->ibv_res_domain.context = context;
+
+    /* set default values */
+    res_domain->attr.thread_model = IBV_EXP_THREAD_SAFE;
+    res_domain->attr.msg_model = IBV_EXP_MSG_DEFAULT;
+    /* get requested valid values */
+    if (attr->comp_mask & IBV_EXP_RES_DOMAIN_THREAD_MODEL)
+        res_domain->attr.thread_model = attr->thread_model;
+    if (attr->comp_mask & IBV_EXP_RES_DOMAIN_MSG_MODEL)
+        res_domain->attr.msg_model = attr->msg_model;
+    res_domain->attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+                                 IBV_EXP_RES_DOMAIN_MSG_MODEL;
+    /*
+     * Allocate a BF for every resource domain, since BF improves
+     * both the BW and the latency of single messages.
+     */
+    res_domain->send_db = allocate_send_db(ctx);
+
+    /* define resource domain type */
+    if (!res_domain->send_db) {
+        if (res_domain->attr.msg_model == IBV_EXP_MSG_FORCE_LOW_LATENCY)
+            /*
+             * Fail if the user asked for the force low-latency
+             * message model but we can't allocate a
+             * dedicated BF.
+             */
+            goto err;
+        else
+            /*
+             * A dedicated BF is not allocated for this
+             * resource domain.
+             */
+            res_domain->type = MLX4_RES_DOMAIN_BF_NONE;
+    } else {
+        /*
+         * A dedicated BF was allocated; set the
+         * resource-domain type according to the
+         * thread model.
+         */
+        switch (res_domain->attr.thread_model) {
+        case IBV_EXP_THREAD_SAFE:
+            res_domain->type = MLX4_RES_DOMAIN_BF_SAFE;
+            break;
+        case IBV_EXP_THREAD_UNSAFE:
+            res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+            break;
+        case IBV_EXP_THREAD_SINGLE:
+            if (wc_auto_evict_size() == 64)
+                res_domain->type = MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT;
+            else
+                res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+            break;
+        }
+    }
+
+    return &res_domain->ibv_res_domain;
+
+err:
+    free(res_domain);
+
+    return NULL;
+}
+
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+                                struct ibv_exp_res_domain *res_dom,
+                                struct ibv_exp_destroy_res_domain_attr *attr)
+{
+    struct mlx4_res_domain *res_domain = to_mres_domain(res_dom);
+
+    if (res_domain->send_db)
+        free_send_db(to_mctx(context), res_domain->send_db);
+
+    free(res_domain);
+
+    return 0;
+}
+
+void *mlx4_exp_query_intf(struct ibv_context *context,
+                          struct ibv_exp_query_intf_params *params,
+                          enum ibv_exp_query_intf_status *status)
+{
+    void *family = NULL;
+    struct mlx4_qp *qp;
+    struct mlx4_cq *cq;
+
+    *status = IBV_EXP_INTF_STAT_OK;
+
+    if (!params->obj) {
+        errno = EINVAL;
+        *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+
+        return NULL;
+    }
+
+    if (params->intf_version > MLX4_MAX_FAMILY_VER) {
+        *status = IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED;
+
+        return NULL;
+    }
+
+    switch (params->intf) {
+    case IBV_EXP_INTF_QP_BURST:
+        qp = to_mqp(params->obj);
+        if (qp->pattern == MLX4_QP_PATTERN) {
+            family = mlx4_get_qp_burst_family(qp, params, status);
+            if (*status != IBV_EXP_INTF_STAT_OK) {
+                fprintf(stderr, PFX "Failed to get QP burst family\n");
+                errno = EINVAL;
+            }
+        } else {
+            fprintf(stderr, PFX "Warning: non-valid QP passed to query interface\n");
+            *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+            errno = EINVAL;
+        }
+        break;
+
+    case IBV_EXP_INTF_CQ:
+        cq = to_mcq(params->obj);
+        if (cq->pattern == MLX4_CQ_PATTERN) {
+            family = (void *)mlx4_get_poll_cq_family(cq, params, status);
+        } else {
+            fprintf(stderr, PFX "Warning: non-valid CQ passed to query interface\n");
+            *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+            errno = EINVAL;
+        }
+        break;
+
+    default:
+        *status = IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED;
+        errno = EINVAL;
+    }
+
+    return family;
+}
+
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+                          struct ibv_exp_release_intf_params *params)
+{
+    return 0;
+}
Index: contrib/ofed/libmlx4/src/wqe.h
===================================================================
--- contrib/ofed/libmlx4/src/wqe.h
+++ contrib/ofed/libmlx4/src/wqe.h
@@ -38,9 +38,19 @@
 };
 
 enum {
-    MLX4_WQE_CTRL_FENCE        = 1 << 6,
-    MLX4_WQE_CTRL_CQ_UPDATE    = 3 << 2,
-    MLX4_WQE_CTRL_SOLICIT      = 1 << 1,
+    MLX4_WQE_CTRL_FENCE        = 1 << 6,
+    MLX4_WQE_CTRL_CQ_UPDATE    = 3 << 2,
+    MLX4_WQE_CTRL_SOLICIT      = 1 << 1,
+    MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7,
+    MLX4_WQE_CTRL_IIP          = 1 << 28,
+    MLX4_WQE_CTRL_IL4          = 1 << 27,
+    MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+    MLX4_WQE_CTRL_IP_CSUM      = 1 << 4,
+};
+
+enum {
+    MLX4_WQE_BIND_TYPE_2     = (1<<31),
+    MLX4_WQE_BIND_ZERO_BASED = (1<<30),
 };
 
 enum {
@@ -54,8 +64,7 @@
 
 struct mlx4_wqe_ctrl_seg {
     uint32_t        owner_opcode;
-    uint16_t        vlan_tag;
-    uint8_t         ins_vlan;
+    uint8_t         reserved[3];
     uint8_t         fence_size;
     /*
     * High 24 bits are SRC remote buffer; low 8 bits are flags:
@@ -66,7 +75,10 @@
     * [1] SE (solicited event)
     * [0] FL (force loopback)
     */
-    uint32_t        xrcrb_flags;
+    union {
+        uint32_t    srcrb_flags;
+        uint16_t    srcrb_flags16[2];
+    };
     /*
     * imm is immediate data for send/RDMA write w/ immediate;
     * also invalidation key for send with invalidate; input
@@ -99,6 +111,19 @@
     uint32_t        reserved2[3];
 };
 
+struct mlx4_wqe_local_inval_seg {
+    uint64_t        reserved1;
+    uint32_t        mem_key;
+    uint32_t        reserved2;
+    uint64_t        reserved3[2];
+};
+
+enum {
+    MLX4_WQE_MW_REMOTE_READ  = 1 << 29,
+    MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+    MLX4_WQE_MW_ATOMIC       = 1 << 31
+};
+
 struct mlx4_wqe_raddr_seg {
     uint64_t        raddr;
     uint32_t        rkey;
@@ -110,6 +135,13 @@
     uint64_t        compare;
 };
 
+struct mlx4_wqe_masked_atomic_seg {
+    uint64_t        swap_data;
+    uint64_t        cmp_data;
+    uint64_t        swap_mask;
+    uint64_t        cmp_mask;
+};
+
 struct mlx4_wqe_bind_seg {
     uint32_t        flags1;
     uint32_t        flags2;
@@ -119,4 +151,11 @@
     uint64_t        length;
 };
 
+struct mlx4_wqe_wait_en_seg {
+    uint32_t        valid;
+    uint32_t        resv;
+    uint32_t        pi;
+    uint32_t        obj_num;
+};
+
 #endif /* WQE_H */
Index: contrib/ofed/usr.lib/libmlx4/Makefile
===================================================================
--- contrib/ofed/usr.lib/libmlx4/Makefile
+++ contrib/ofed/usr.lib/libmlx4/Makefile
@@ -14,7 +14,7 @@
 SHLIB_MAJOR= 1
 MK_PROFILE= no
 
-SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c
+SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c verbs_exp.c
 
 LIBADD= ibverbs pthread
 CFLAGS+= -DHAVE_CONFIG_H
Index: contrib/ofed/usr.lib/libmlx4/config.h
===================================================================
--- contrib/ofed/usr.lib/libmlx4/config.h
+++ contrib/ofed/usr.lib/libmlx4/config.h
@@ -1,4 +1,3 @@
-#define HAVE_IBV_DONTFORK_RANGE
-#define HAVE_IBV_DOFORK_RANGE
-#define HAVE_IBV_REGISTER_DRIVER
-#define HAVE_IBV_READ_SYSFS_FILE
+#define HAVE_IBV_DOFORK_RANGE 1
+#define HAVE_IBV_DONTFORK_RANGE 1
+#define HAVE_IBV_REGISTER_DRIVER 1
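For reference, the experimental provider entry points added above (mlx4_exp_create_res_domain(), mlx4_exp_query_intf(), mlx4_exp_release_intf()) are not called directly by applications; they are reached through the matching ibv_exp_*() wrappers declared in <infiniband/verbs_exp.h>. The fragment below is an illustrative sketch only, not part of the patch: it assumes the MLNX_OFED experimental verbs API, the helper name get_burst_family() is ours, and in a real consumer the resource domain would be attached to the QP through the experimental QP-create attributes rather than created side by side with an existing QP as shown here.

/*
 * Sketch (assumption-laden, not part of the patch): acquire a resource
 * domain and the QP burst-family interface via the experimental verbs API.
 * 'qp' is assumed to have been created by this provider (see the
 * MLX4_QP_PATTERN check in mlx4_exp_query_intf() above).
 */
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs_exp.h>

static void *get_burst_family(struct ibv_context *ctx, struct ibv_qp *qp)
{
    struct ibv_exp_res_domain_init_attr rd_attr;
    struct ibv_exp_query_intf_params intf_params;
    enum ibv_exp_query_intf_status status;
    struct ibv_exp_res_domain *rd;
    void *burst_ops;

    memset(&rd_attr, 0, sizeof(rd_attr));
    rd_attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
                        IBV_EXP_RES_DOMAIN_MSG_MODEL;
    /* IBV_EXP_THREAD_SINGLE lets mlx4 pick the WC-evict BF type when the
     * write-combining auto-evict size is 64 bytes (see
     * mlx4_exp_create_res_domain() above). */
    rd_attr.thread_model = IBV_EXP_THREAD_SINGLE;
    /* IBV_EXP_MSG_DEFAULT: do not fail if no dedicated BF is available. */
    rd_attr.msg_model = IBV_EXP_MSG_DEFAULT;

    rd = ibv_exp_create_res_domain(ctx, &rd_attr);
    if (!rd)
        return NULL;    /* e.g. ENOMEM once all BF registers are in use */

    memset(&intf_params, 0, sizeof(intf_params));
    intf_params.intf = IBV_EXP_INTF_QP_BURST;
    intf_params.obj = qp;   /* intf_version left at 0, <= MLX4_MAX_FAMILY_VER */

    burst_ops = ibv_exp_query_intf(ctx, &intf_params, &status);
    if (status != IBV_EXP_INTF_STAT_OK)
        fprintf(stderr, "query_intf failed, status %d\n", status);

    /* The caller casts burst_ops to the burst-family ops table and later
     * releases it with ibv_exp_release_intf(); the resource domain is
     * freed with ibv_exp_destroy_res_domain() when no longer needed. */
    return burst_ops;
}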