D5793.diff

Index: contrib/ofed/libmlx4/Makefile.am
===================================================================
--- contrib/ofed/libmlx4/Makefile.am
+++ contrib/ofed/libmlx4/Makefile.am
@@ -1,12 +1,19 @@
-AM_CFLAGS = -g -Wall -D_GNU_SOURCE
+AM_CFLAGS = -g -Wall -Werror -D_GNU_SOURCE
mlx4_version_script = @MLX4_VERSION_SCRIPT@
MLX4_SOURCES = src/buf.c src/cq.c src/dbrec.c src/mlx4.c src/qp.c \
- src/srq.c src/verbs.c
+ src/srq.c src/verbs.c src/verbs_exp.c
+noinst_HEADERS = src/bitmap.h src/doorbell.h src/list.h src/mlx4-abi.h src/mlx4_exp.h src/mlx4.h src/wqe.h
if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
- lib_LTLIBRARIES = src/libmlx4.la
+ lib_LTLIBRARIES =
+else
+ mlx4lib_LTLIBRARIES =
+endif
+
+if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
+ lib_LTLIBRARIES += src/libmlx4.la
src_libmlx4_la_SOURCES = $(MLX4_SOURCES)
src_libmlx4_la_LDFLAGS = -avoid-version -release @IBV_DEVICE_LIBRARY_EXTENSION@ \
$(mlx4_version_script)
@@ -14,13 +21,14 @@
mlx4conf_DATA = mlx4.driver
else
mlx4libdir = $(libdir)/infiniband
- mlx4lib_LTLIBRARIES = src/mlx4.la
+ mlx4lib_LTLIBRARIES += src/mlx4.la
src_mlx4_la_SOURCES = $(MLX4_SOURCES)
src_mlx4_la_LDFLAGS = -avoid-version -module $(mlx4_version_script)
endif
-EXTRA_DIST = src/doorbell.h src/mlx4.h src/mlx4-abi.h src/wqe.h \
- src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST = src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST += debian
+EXTRA_DIST += autogen.sh
dist-hook: libmlx4.spec
cp libmlx4.spec $(distdir)
Index: contrib/ofed/libmlx4/autogen.sh
===================================================================
--- contrib/ofed/libmlx4/autogen.sh
+++ contrib/ofed/libmlx4/autogen.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#! /bin/sh -eE
set -x
aclocal -I config
Index: contrib/ofed/libmlx4/configure.ac
===================================================================
--- contrib/ofed/libmlx4/configure.ac
+++ contrib/ofed/libmlx4/configure.ac
@@ -1,12 +1,15 @@
dnl Process this file with autoconf to produce a configure script.
AC_PREREQ(2.57)
-AC_INIT(libmlx4, 1.0, general@lists.openfabrics.org)
+AC_INIT(libmlx4, 1.0.6mlnx1, linux-rdma@vger.kernel.org)
AC_CONFIG_SRCDIR([src/mlx4.h])
AC_CONFIG_AUX_DIR(config)
-AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(libmlx4, 1.0)
-AM_PROG_LIBTOOL
+AC_CONFIG_HEADER(config.h)
+AM_INIT_AUTOMAKE([1.10 foreign tar-ustar silent-rules subdir-objects])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_PROG_LIBTOOL
+LT_INIT
AC_ARG_WITH([valgrind],
AC_HELP_STRING([--with-valgrind],
@@ -21,6 +24,13 @@
fi
fi
+#--with-wqe-format
+AC_ARG_WITH([wqe-format],
+ AC_HELP_STRING([--with-wqe-format],
+ [Enable wqe-format annotations (default NO)]),
+ AC_DEFINE([MLX4_WQE_FORMAT], 1, [Define to 1 to enable wqe-format annotations.]),
+)
+
dnl Checks for programs
AC_PROG_CC
@@ -32,22 +42,19 @@
AC_CHECK_HEADER(infiniband/driver.h, [],
AC_MSG_ERROR([<infiniband/driver.h> not found. libmlx4 requires libibverbs.]))
AC_HEADER_STDC
-AC_CHECK_HEADER(valgrind/memcheck.h,
- [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
- [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
- [if test $want_valgrind = yes; then
- AC_MSG_ERROR([Valgrind memcheck support requested, but <valgrind/memcheck.h> not found.])
- fi])
+
+if test x$want_valgrind = xyes; then
+ AC_CHECK_HEADER(valgrind/memcheck.h,
+ [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
+ [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
+ [if test $want_valgrind = yes; then
+ AC_MSG_ERROR([Valgrind memcheck support requested, but <valgrind/memcheck.h> not found.])
+ fi])
+fi
dnl Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_CHECK_SIZEOF(long)
-AC_CHECK_MEMBER(struct ibv_context.more_ops,
- [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],,
- [#include <infiniband/verbs.h>])
-AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq,
- [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],,
- [#include <infiniband/verbs.h>])
dnl Checks for library functions
AC_CHECK_FUNC(ibv_read_sysfs_file, [],
Index: contrib/ofed/libmlx4/debian/changelog
===================================================================
--- contrib/ofed/libmlx4/debian/changelog
+++ contrib/ofed/libmlx4/debian/changelog
@@ -1,8 +1,201 @@
-libmlx4 (1.0-2) unstable; urgency=low
+libmlx4 (1.0.6mlnx1-1) unstable; urgency=low
- * Add debian/watch file
+ * libmlx4: Fix MR address change in rereg_mr
+ * libmlx4: revert the endianness fix for immediate data
+ * libmlx4: split post_send_one to qp types
+ * libmlx4: Add post_send_one to qp struct
+ * libmlx4: remove inl from basic set_data_seg functions
+ * libmlx4: Set data segment in one function
+ * libmlx4: set ctrl segment in one function
+ * libmlx4: use htonl when copy immediate data to WQE
+ * libmlx4: fix bug in bf_buf_size update
+ * libmlx4: Define set_data_seg as inline function
+ * libmlx4: reduce cache used by datapath
+ * libmlx4: optimize wq_overflow
+ * libmlx4: Add another DB ringing method
+ * libmlx4: Use x86_64 SSE2 instructions to improve bf_copy
+ * libmlx4: Add new DB ringing mode
+ * libmlx4: use all 8 BFs
+ * libmlx4: split ring_db function
+ * libmlx4: add door-bell ring function
+ * Modify call from ibv_exp_getenv to ibv_exp_cmd_getenv
+ * libmlx4: fix contiguous page registration
+ * Modify to use verbs specific getenv
+ * libmlx4: avoid creating AH with DLID 0
+ * libmlx4: fixed resize cq overrun bug
+ * libmlx4.spec.in: Changed valgrind libs DESTDIR
+ * Added valgrind support
+ * fixed and added valgrind Macros
+ * Adding experimental dereg_mr support
+ * shared_mr: handle duplication from glob/procfs
+ * shared_mr: fine-tuned counter mode name
+ * fix 32 bit compile warning
+ * shared mr with counter name support
+ * libmlx4: allow user to specify the addr of contig pages.
+ * libmlx4: avoid using gettimeofday in mlx4_reg_shared_mr.
+ * libmlx4: init exp_mw_bind.
+ * libmlx4: added -Werror to Makefile
+ * libmlx4: Use masked atomics only if max_atomic_arg defined
+ * wc_flags should be set even when using experimental verbs
+ * libmlx4: return errno on ibv_post_srq_recv
+ * libmlx4: Retry open shared mr file
+ * libmlx4: Add completion opcodes for masked atomic operations
+ * Verify hop_limit > 1 in create_ah
+ * libmlx4.spec.in: Support configure_options flag.
+ * configure: Update AM_INIT_AUTOMAKE to support new auto tools.
+ * Add MR re-registration
+ * mlx4: Add support for timestamping when initiating context.
+ * libmlx4: Do not publish support for IBV_CALC_OP_MAXLOC
+ * Fix comp_mask handling in ibv_exp_query_values
+ * libmlx4: Simplify extended atomics API
+ * libmlx4: Fix wrong wqe pointer advance
+ * libmlx4: Add support for masked atomics
+ * Revert "libmlx4: Fix log function to avoid overflow"
+ * libmlx4: add ibv_exp_modify_qp to mlx4
+ * libmlx4: Fix overflow on flag mask
+ * libmlx4: Fix log function to avoid overflow
+ * libmlx4: improve experimental interface
+ * A correct AH was free'd by mistake
+ * Align create_ah_ex and query_port_ex to upstream
+ * Change imm_data to ex.imm_data or ex.invalidate_rkey
+ * libmlx4: change wc_size from int to uint32_t.
+ * libmlx4: Print prefer_bf message only in trace mode.
+ * libmlx4: separate mlx4_post_send to EXP & NON EXP
- -- Roland Dreier <rolandd@cisco.com> Wed, 12 Mar 2008 10:40:19 -0700
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Wed, 10 Dec 2014 10:53:10 +0200
+
+libmlx4 (1.0.5mlnx1-1) unstable; urgency=low
+
+ * resize_cq: fix possible endless loop scanning CQ
+ * User QP/SRQ in work completion
+ * libmlx4: Align verbs interface with upstream
+ * libmlx4: add ibv_exp_reg_mr experimental verb
+ * libmlx4: Change legacy extended verbs to experimental verbs
+ * libmlx4: Change legacy extended uverbs to experimental uverbs
+ * unmap hca_clock_page in mlx4_uninit_context
+ * Enable contiguous pages for Control resources by default
+ * New experimental verbs for query_port
+ * Added htobe64 definition which is missing on SLES10
+ * Fix QoS issues for UD QPs
+ * Allocate zeroized memory for CQ
+ * libmlx4: Change sandy bridge work around algorithm
+ * libmlx4: add debian to EXTRA_DIST
+ * libmlx4: add support for "git review" command line gerrit tool
+ * libmlx4: Fix "make distcheck"
+ * Add allowed_wc_flags
+ * libmlx4: Fix valgrind errors.
+ * Raw IB QP fix
+ * libmlx4: Change inline receive interface
+ * Revert "move flow steering to experimental verbs"
+ * move flow steering to experimental verbs
+ * libmlx4: resolve segfault on ibv_xsrq_pingpong
+ * Raw Eth QP - prevent loopback on SRIOV
+ * libmlx4: remove struct ts and use direct field timestamp
+ * Fix compilation issue due to shifting bind_mw struct in ib_send_wr
+ * libmlx4: Add experimental inline receive
+ * Double check in order to prevent division by zero.
+ * Add a missing check for a value of a certain variable
+ * libmlx4 - qp: optimize single segment case around set_data_seg()
+ * libmlx4 - Inform GCC about hotspot functions so those can be optimized more aggressively.
+ * libmlx4 - Add branch prediction helpers to qp and cq data path functions.
+ * libmlx4 - Using unsigned indices allow GCC to generate a bit more efficient code.
+ * IP based addressing support
+ * Implementing verbs bind_mw (for binding type 1 memory windows)
+ * Adding support to post bind (type 2) memory windows
+ * Adding support to post invalidate messages
+ * Implementing verbs alloc_mw and dealloc_mw
+ * Adding work completions that are related to memory windows
+ * fix incorrect timestamp
+ * add a workaround for hw bug in hwclock wraparound
+ * extension verb: mlx4_query_values are reading hwclock
+ * extension verb: mlx4_query_device_ex
+ * extension verb: mlx4_create_cq_ex
+ * implement ibv_poll_cq_ex extension verb
+ * XRC - move warning to be under trace mode
+ * XRC - fix leak in legacy flow
+ * libmlx4: Globally avoid spinlocks for multithreaded apps
+ * Handle missing symbols in Xen server 6.1
+ * libmlx4: Cache link layer's type in mlx4_context. Caching will allow us to avoid ibv_query_port calls and save time in ibv_create_ah.
+ * XRC - sync to latest upstream changes
+ * XRC issues
+ * libmlx4: XRC binary compat layer
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Sun, 23 Mar 2014 14:16:10 +0200
+
+libmlx4 (1.0.4mlnx2-1) unstable; urgency=low
+
+ * libmlx4: Add Cross-channel capability
+ * libmlx4: Add mlx4_post_task
+ * libmlx4: Add mlx4_query_device_ex
+ * libmlx4: Add mlx4_modify_cq
+ * libmlx4: Support Cross-channel capability in mlx4_create_qp_ex
+ * libmlx4: Add new fields and opcodes to support Cross-channel
+ * libmlx4: Remove legacy mverbs code
+ * libmlx4: Add support for XRC QPs
+ * libmlx4: contig pages over 4GB
+ * stall code to be run only on x86
+ * Implement ibv_create_flow and ibv_destroy_flow
+ * Revert "Add support for ibv_attach_flow and ibv_detach_flow."
+ * libmlx4 fix compilation warnings
+ * Handle 0-length s/g list entries correctly
+ * libmlx4.spec.in: Fix %files macro
+ * configure: disable mverbs by default
+ * libmlx4: verbs extensions breaks MVERBS implementation
+ * shared_mr support on top of verbs extension
+ * libmlx4: Infra-structure changes to support verbs extensions
+ * fixed an issue with definition of container_of
+ * Revert "verbs extension mechanism based on Sean first patch"
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Mon, 7 Jan 2013 13:38:10 +0200
+
+libmlx4 (1.0.4mlnx1-1) unstable; urgency=low
+
+ * New Mellanox release.
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Mon, 7 Jan 2013 13:38:10 +0200
+
+libmlx4 (1.0.4-1) unstable; urgency=low
+
+ * New upstream release.
+ - IBoE multicast support.
+ * Update maintainer and remove DM-Upload-Allowed now that I'm a DD.
+
+ -- Roland Dreier <rbd@debian.org> Wed, 28 Mar 2012 10:31:52 -0700
+
+libmlx4 (1.0.3-1) unstable; urgency=low
+
+ * New upstream release.
+ - Add ConnectX-3 support.
+ - Add IBoE support.
+ * Since we have plugin in /usr/lib/libibverbs, we need to depend on
+ libibverbs (>= 1.1.3).
+
+ -- Roland Dreier <roland@digitalvampire.org> Wed, 06 Jul 2011 23:54:24 -0700
+
+libmlx4 (1.0.2-1) unstable; urgency=low
+
+ * New upstream release.
+ - Fix potential problems running under Valgrind.
+ - Add support for resize CQ operation.
+ - Fix other minor bugs.
+ * Update maintainer and set DM-Upload-Allowed to yes. (Closes: #632108)
+ * Switch to dpkg-source 3.0 (quilt) format.
+ * Acknowledge NMU (Closes: #621664).
+ * Change build system from cdbs to debhelper 7.
+ * Use libibverbs 1.1.3 feature to move plugin to /usr/lib/libibverbs
+ to fix multiple problems with a not-exactly-shlib in /usr/lib.
+ * Add debian/watch file.
+ * Move -dbg package to section debug.
+ * Update to Standards-Version: 3.9.2.
+
+ -- Roland Dreier <roland@digitalvampire.org> Wed, 06 Jul 2011 13:32:18 -0700
+
+libmlx4 (1.0-1.1) unstable; urgency=low
+
+ * Non-maintainer upload.
+ * Don't ship .la files (Closes: #621664).
+
+ -- Luk Claes <luk@debian.org> Fri, 01 Jul 2011 19:09:59 +0200
libmlx4 (1.0-1) unstable; urgency=low
Index: contrib/ofed/libmlx4/debian/compat
===================================================================
--- contrib/ofed/libmlx4/debian/compat
+++ contrib/ofed/libmlx4/debian/compat
@@ -1 +1 @@
-5
+7
Index: contrib/ofed/libmlx4/debian/control
===================================================================
--- contrib/ofed/libmlx4/debian/control
+++ contrib/ofed/libmlx4/debian/control
@@ -1,16 +1,16 @@
Source: libmlx4
Priority: extra
-Maintainer: Roland Dreier <rolandd@cisco.com>
-Build-Depends: @cdbs@, libibverbs-dev (>= 1.0)
-Standards-Version: 3.7.3
+Maintainer: Roland Dreier <rbd@debian.org>
+Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.3)
+Standards-Version: 3.9.2
Section: libs
Homepage: http://www.openfabrics.org/
Package: libmlx4-1
Section: libs
Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: A userspace driver for Mellanox ConnectX InfiniBand HCAs
+Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.3)
+Description: Userspace driver for Mellanox ConnectX InfiniBand HCAs
libmlx4 is a device-specific driver for Mellanox ConnectX InfiniBand
host channel adapters (HCAs) for the libibverbs library. This allows
userspace processes to access Mellanox HCA hardware directly with
@@ -32,7 +32,7 @@
directly to an application, which may be useful for debugging.
Package: libmlx4-1-dbg
-Section: libdevel
+Section: debug
Priority: extra
Architecture: any
Depends: ${misc:Depends}, libmlx4-1 (= ${binary:Version})
Index: contrib/ofed/libmlx4/debian/libmlx4-1.install
===================================================================
--- contrib/ofed/libmlx4/debian/libmlx4-1.install
+++ contrib/ofed/libmlx4/debian/libmlx4-1.install
@@ -1,2 +1,2 @@
-usr/lib/libmlx4-rdmav2.so
+usr/lib/libmlx4-rdmav2.so /usr/lib/libibverbs/
etc/libibverbs.d/mlx4.driver
Index: contrib/ofed/libmlx4/debian/libmlx4-dev.install
===================================================================
--- contrib/ofed/libmlx4/debian/libmlx4-dev.install
+++ contrib/ofed/libmlx4/debian/libmlx4-dev.install
@@ -1 +1 @@
-usr/lib/libmlx4.{a,la}
+usr/lib/libmlx4.a
Index: contrib/ofed/libmlx4/debian/rules
===================================================================
--- contrib/ofed/libmlx4/debian/rules
+++ contrib/ofed/libmlx4/debian/rules
@@ -1,8 +1,10 @@
#!/usr/bin/make -f
# -*- mode: makefile; coding: utf-8 -*-
-DEB_DH_INSTALL_SOURCEDIR := debian/tmp
-DEB_AUTO_UPDATE_LIBTOOL := post
+%:
+ dh $@
-include /usr/share/cdbs/1/rules/debhelper.mk
-include /usr/share/cdbs/1/class/autotools.mk
+override_dh_strip:
+ dh_strip --dbg-package=libmlx4-1-dbg
+
+override_dh_makeshlibs:
Index: contrib/ofed/libmlx4/libmlx4.spec.in
===================================================================
--- contrib/ofed/libmlx4/libmlx4.spec.in
+++ contrib/ofed/libmlx4/libmlx4.spec.in
@@ -1,15 +1,27 @@
+%{!?_with_valgrind: %define _with_valgrind 0}
+%{!?_disable_valgrind: %define _disable_valgrind 0}
+
+%if 0%{?rhel} == 6
+%if 0%{_disable_valgrind} == 0
+%define _with_valgrind 1
+%endif
+%endif
+
Name: libmlx4
-Version: 1.0
-Release: 2%{?dist}
+Version: 1.0.6mlnx1
+Release: 1%{?dist}
Summary: Mellanox ConnectX InfiniBand HCA Userspace Driver
Group: System Environment/Libraries
License: GPLv2 or BSD
Url: http://openfabrics.org/
-Source: http://openfabrics.org/downloads/mlx4/libmlx4-1.0.tar.gz
+Source: http://openfabrics.org/downloads/mlx4/libmlx4-%{version}.tar.gz
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
-BuildRequires: libibverbs-devel >= 1.1-0.1.rc2
+BuildRequires: libibverbs-devel >= 1.1.6mlnx2
+%if %{_with_valgrind}
+BuildRequires: valgrind-devel
+%endif
%description
libmlx4 provides a device-specific userspace driver for Mellanox
@@ -29,12 +41,24 @@
%setup -q -n %{name}-@VERSION@
%build
-%configure
+%if %{_with_valgrind}
+%configure %{?configure_options} --libdir=%{_libdir}/mlnx_ofed/valgrind --with-valgrind
+make %{?_smp_mflags}
+make DESTDIR=$RPM_BUILD_DIR/%{name}-%{version}/valgrind install
+rm -f $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind/*.*a
+make clean
+%endif
+
+%configure %{?configure_options}
make %{?_smp_mflags}
%install
rm -rf $RPM_BUILD_ROOT
make DESTDIR=%{buildroot} install
+%if %{_with_valgrind}
+mkdir -p %{buildroot}/%{_libdir}/mlnx_ofed
+cp -a $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind %{buildroot}/%{_libdir}/mlnx_ofed
+%endif
# remove unpackaged files from the buildroot
rm -f $RPM_BUILD_ROOT%{_libdir}/*.la $RPM_BUILD_ROOT%{_libdir}/libmlx4.so
@@ -43,15 +67,34 @@
%files
%defattr(-,root,root,-)
-%{_libdir}/libmlx4-rdmav2.so
+%{_libdir}/libmlx4*.so
+%if %{_with_valgrind}
+%{_libdir}/mlnx_ofed/valgrind/libmlx4*.so
+%endif
%{_sysconfdir}/libibverbs.d/mlx4.driver
%doc AUTHORS COPYING README
%files devel
%defattr(-,root,root,-)
-%{_libdir}/libmlx4.a
+%{_libdir}/libmlx4*.a
%changelog
+* Mon Mar 28 2012 Roland Dreier <roland@digitalvampire.org> - 1.0.4-1
+- New upstream release
+
+* Mon Mar 26 2012 Roland Dreier <roland@digitalvampire.org> - 1.0.3-1
+- New upstream release
+
+* Wed Jul 6 2011 Roland Dreier <roland@digitalvampire.org> - 1.0.2-1
+- New upstream release
+
+* Wed Jun 17 2009 Roland Dreier <rdreier@cisco.com> - 1.0.1-1
+- New upstream release
+- Change openib.org URLs to openfabrics.org URLs
+
+* Wed Feb 25 2009 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 1.0-3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild
+
* Sun Jan 27 2008 Roland Dreier <rdreier@cisco.com> - 1.0-2
- Spec file cleanups, based on Fedora review: don't mark
libmlx4.driver as a config file, since it is not user modifiable,
Index: contrib/ofed/libmlx4/src/bitmap.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/bitmap.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2000, 2011 Mellanox Technology Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef BITMAP_H
+#define BITMAP_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#ifndef min
+#define min(a, b) \
+ ({ typeof(a) _a = (a); \
+ typeof(b) _b = (b); \
+ _a < _b ? _a : _b; })
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define MLX4_SHM_ADDR (void *)(0x8000000000000000UL)
+#define MLX4_SHMAT_FLAGS (SHM_RND)
+#else
+#define MLX4_SHM_ADDR (void *)(0x0UL)
+#define MLX4_SHMAT_FLAGS (0)
+#endif
+
+struct __dummy_h { unsigned long a[100]; };
+#define MLX4_ADDR (*(struct __dummy_h *) addr)
+#define MLX4_CONST_ADDR (*(const struct __dummy_h *) addr)
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE 8
+#define BITS_PER_WORD (BITS_PER_BYTE * sizeof(uint32_t))
+#define BITS_TO_WORDS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(uint32_t))
+
+#ifndef HPAGE_SIZE
+#define HPAGE_SIZE (2UL*1024*1024)
+#endif
+
+#define MLX4_SHM_LENGTH (HPAGE_SIZE)
+#define MLX4_Q_CHUNK_SIZE 32768
+#define MLX4_SHM_NUM_REGION 64
+
+struct mlx4_bitmap {
+ uint32_t last;
+ uint32_t top;
+ uint32_t max;
+ uint32_t avail;
+ uint32_t mask;
+ struct mlx4_spinlock lock;
+ uint32_t *table;
+};
+
+inline unsigned long mlx4_ffz(uint32_t word)
+{
+ return __builtin_ffs(~word) - 1;
+}
+
+inline void mlx4_set_bit(unsigned int nr, uint32_t *addr)
+{
+
+ addr[(nr / BITS_PER_WORD)]
+ |= (1 << (nr % BITS_PER_WORD));
+
+
+}
+
+inline void mlx4_clear_bit(unsigned int nr, uint32_t *addr)
+{
+ addr[(nr / BITS_PER_WORD)]
+ &= ~(1 << (nr % BITS_PER_WORD));
+}
+
+inline int mlx4_test_bit(unsigned int nr, const uint32_t *addr)
+{
+ return !!(addr[(nr / BITS_PER_WORD)]
+ & (1 << (nr % BITS_PER_WORD)));
+}
+
+inline uint32_t mlx4_find_first_zero_bit(const uint32_t *addr,
+ uint32_t size)
+{
+ const uint32_t *p = addr;
+ uint32_t result = 0;
+ uint32_t tmp;
+
+ while (size & ~(BITS_PER_WORD - 1)) {
+ tmp = *(p++);
+ if (~tmp)
+ goto found;
+ result += BITS_PER_WORD;
+ size -= BITS_PER_WORD;
+ }
+ if (!size)
+ return result;
+
+ tmp = (*p) | (~0UL << size);
+ if (tmp == (uint32_t)~0UL) /* Are any bits zero? */
+ return result + size; /* Nope. */
+found:
+ return result + mlx4_ffz(tmp);
+}
+
+int mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+ uint32_t obj;
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+
+ obj = mlx4_find_first_zero_bit(bitmap->table, bitmap->max);
+ if (obj < bitmap->max) {
+ mlx4_set_bit(obj, bitmap->table);
+ bitmap->last = (obj + 1);
+ if (bitmap->last == bitmap->max)
+ bitmap->last = 0;
+ obj |= bitmap->top;
+ ret = obj;
+ } else
+ ret = -1;
+
+ if (ret != -1)
+ --bitmap->avail;
+
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+static inline uint32_t find_aligned_range(uint32_t *bitmap,
+ uint32_t start, uint32_t nbits,
+ int len, int alignment)
+{
+ uint32_t end, i;
+
+again:
+ start = align(start, alignment);
+
+ while ((start < nbits) && mlx4_test_bit(start, bitmap))
+ start += alignment;
+
+ if (start >= nbits)
+ return -1;
+
+ end = start + len;
+ if (end > nbits)
+ return -1;
+
+ for (i = start + 1; i < end; i++) {
+ if (mlx4_test_bit(i, bitmap)) {
+ start = i + 1;
+ goto again;
+ }
+ }
+
+ return start;
+}
+
+static inline int mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+ int align)
+{
+ uint32_t obj;
+ int ret, i;
+
+ if (cnt == 1 && align == 1)
+ return mlx4_bitmap_alloc(bitmap);
+
+ if (cnt > bitmap->max)
+ return -1;
+
+ mlx4_spin_lock(&bitmap->lock);
+
+ obj = find_aligned_range(bitmap->table, bitmap->last,
+ bitmap->max, cnt, align);
+ if (obj >= bitmap->max) {
+ bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+ obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+ cnt, align);
+ }
+
+ if (obj < bitmap->max) {
+ for (i = 0; i < cnt; i++)
+ mlx4_set_bit(obj + i, bitmap->table);
+ if (obj == bitmap->last) {
+ bitmap->last = (obj + cnt);
+ if (bitmap->last >= bitmap->max)
+ bitmap->last = 0;
+ }
+ obj |= bitmap->top;
+ ret = obj;
+ } else
+ ret = -1;
+
+ if (ret != -1)
+ bitmap->avail -= cnt;
+
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return obj;
+}
+
+static inline void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, uint32_t obj,
+ int cnt)
+{
+ int i;
+
+ obj &= bitmap->max - 1;
+
+ mlx4_spin_lock(&bitmap->lock);
+ for (i = 0; i < cnt; i++)
+ mlx4_clear_bit(obj + i, bitmap->table);
+ bitmap->last = min(bitmap->last, obj);
+ bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+ bitmap->avail += cnt;
+ mlx4_spin_unlock(&bitmap->lock);
+}
+
+static inline int is_bitmap_empty(struct mlx4_bitmap *bitmap)
+{
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+ ret = (bitmap->avail == bitmap->max) ? 1 : 0;
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+static inline int is_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+ ret = (bitmap->avail > 0) ? 1 : 0;
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, uint32_t num, uint32_t mask)
+{
+ bitmap->last = 0;
+ bitmap->top = 0;
+ bitmap->max = bitmap->avail = num;
+ bitmap->mask = mask;
+ bitmap->avail = bitmap->max;
+ mlx4_spinlock_init(&bitmap->lock, !mlx4_single_threaded);
+ bitmap->table = malloc(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t));
+
+ if (!bitmap->table)
+ return -ENOMEM;
+ memset((void *)bitmap->table, 0,
+ (int)(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t)));
+ return 0;
+}
+
+inline void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+ if (bitmap->table)
+ free(bitmap->table);
+}
+
+static inline void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, uint32_t obj)
+{
+ mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+#endif
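
The new src/bitmap.h above implements a word-based bitmap allocator used to hand out MLX4_Q_CHUNK_SIZE-sized chunks of a hugetlb shared-memory segment. A minimal standalone sketch of the same find-first-zero / aligned-range idea is shown below; it deliberately drops the mlx4 spinlock helpers, and every demo_* name is invented for this illustration rather than taken from the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_BITS_PER_WORD (8 * sizeof(uint32_t))

/* Mark one chunk as used (same arithmetic as mlx4_set_bit). */
static void demo_set_bit(unsigned nr, uint32_t *table)
{
	table[nr / DEMO_BITS_PER_WORD] |= 1u << (nr % DEMO_BITS_PER_WORD);
}

static int demo_test_bit(unsigned nr, const uint32_t *table)
{
	return !!(table[nr / DEMO_BITS_PER_WORD] & (1u << (nr % DEMO_BITS_PER_WORD)));
}

/* Find 'len' free chunks starting on an 'alignment' boundary, or -1
 * (a simplified, lock-free cousin of find_aligned_range()). */
static int demo_find_aligned_range(const uint32_t *table, unsigned nbits,
				   unsigned len, unsigned alignment)
{
	unsigned start, i;

	for (start = 0; start + len <= nbits; start += alignment) {
		for (i = 0; i < len; i++)
			if (demo_test_bit(start + i, table))
				break;
		if (i == len)
			return (int)start;
	}
	return -1;
}

int main(void)
{
	uint32_t table[4];		/* 4 words = 128 chunks */
	int run, single;

	memset(table, 0, sizeof(table));
	demo_set_bit(0, table);		/* pretend chunk 0 is already taken */

	run = demo_find_aligned_range(table, 128, 4, 4);	/* expect 4 */
	single = demo_find_aligned_range(table, 128, 1, 1);	/* expect 1 */
	printf("aligned run at %d, single chunk at %d\n", run, single);
	return 0;
}
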
Index: contrib/ofed/libmlx4/src/buf.c
===================================================================
--- contrib/ofed/libmlx4/src/buf.c
+++ contrib/ofed/libmlx4/src/buf.c
@@ -36,9 +36,21 @@
#include <stdlib.h>
#include <errno.h>
+#include <signal.h>
#include <sys/mman.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <stdio.h>
#include "mlx4.h"
+#include "bitmap.h"
+
+struct mlx4_hugetlb_mem {
+ int shmid;
+ char *shmaddr;
+ struct mlx4_bitmap bitmap;
+ struct list_head list;
+};
#if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE))
@@ -59,13 +71,154 @@
#endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */
+void mlx4_hugetlb_mem_free(struct mlx4_hugetlb_mem *hmem)
+{
+ mlx4_bitmap_cleanup(&hmem->bitmap);
+
+ if (shmdt((const void *)hmem->shmaddr) != 0) {
+ if (mlx4_trace)
+ perror("Detach shm failure");
+ }
+ free(hmem);
+}
+static void mlx4_free_buf_huge_ex(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ int do_fork)
+{
+ struct mlx4_hugetlb_mem *hmem;
+
+ if (do_fork)
+ ibv_dofork_range(buf->buf, buf->length);
+
+ if (buf->hmem == NULL) {
+ if (mlx4_trace)
+ perror("No hugetlb mem");
+ return;
+ }
+
+ hmem = (struct mlx4_hugetlb_mem *) buf->hmem;
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ mlx4_bitmap_free_range(&hmem->bitmap, buf->base,
+ buf->length/MLX4_Q_CHUNK_SIZE);
+
+ if (is_bitmap_empty(&hmem->bitmap)) {
+ list_del(&hmem->list);
+ mlx4_hugetlb_mem_free(hmem);
+ }
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+}
+
+void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf)
+{
+ mlx4_free_buf_huge_ex(mctx, buf, 1);
+}
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 0
+#endif
+
+struct mlx4_hugetlb_mem *mxl4_hugetlb_mem_alloc(size_t size)
+{
+ struct mlx4_hugetlb_mem *hmem;
+ size_t shm_len;
+
+ hmem = malloc(sizeof(*hmem));
+ if (!hmem)
+ return NULL;
+
+ shm_len = (size > MLX4_SHM_LENGTH) ? align(size, MLX4_SHM_LENGTH) :
+ MLX4_SHM_LENGTH;
+ hmem->shmid = shmget(IPC_PRIVATE, shm_len,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (hmem->shmid < 0) {
+ if (mlx4_trace)
+ perror("shmget");
+ free(hmem);
+ return NULL;
+ }
+
+ hmem->shmaddr = shmat(hmem->shmid, MLX4_SHM_ADDR, MLX4_SHMAT_FLAGS);
+ if (hmem->shmaddr == (char *)-1) {
+ if (mlx4_trace)
+ perror("Shared memory attach failure");
+ shmctl(hmem->shmid, IPC_RMID, NULL);
+ free(hmem);
+ return NULL;
+ }
+
+ if (mlx4_bitmap_init(&hmem->bitmap, shm_len/MLX4_Q_CHUNK_SIZE,
+ shm_len/MLX4_Q_CHUNK_SIZE - 1)) {
+ if (mlx4_trace)
+ perror("mlx4_bitmap_init");
+ mlx4_hugetlb_mem_free(hmem);
+ return NULL;
+ }
+
+ /* Marked to destroy when process detaches from shmget segment */
+ shmctl(hmem->shmid, IPC_RMID, NULL);
+
+ return hmem;
+}
+
+
+int mlx4_alloc_prefered_buf(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ size_t size, int page_size,
+ enum mlx4_alloc_type alloc_type,
+ const char *component)
+{
+ int ret = 1;
+
+ buf->hmem = NULL;
+ /* Fallback mechanism is used below;
+ priority is: huge pages, contig pages, default allocation */
+ if (alloc_type == MLX4_ALLOC_TYPE_HUGE ||
+ alloc_type == MLX4_ALLOC_TYPE_PREFER_HUGE ||
+ alloc_type == MLX4_ALLOC_TYPE_ALL) {
+ ret = mlx4_alloc_buf_huge(mctx, buf,
+ size,
+ page_size);
+ if (!ret)
+ return 0;
+
+ /* Checking whether HUGE is forced */
+ if (alloc_type == MLX4_ALLOC_TYPE_HUGE)
+ return -1;
+ if (mlx4_trace)
+ printf(PFX "Huge mode allocation has failed,fallback to %s mode\n",
+ MLX4_ALLOC_TYPE_ALL ? "contig" : "default");
+
+ }
+
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG ||
+ alloc_type == MLX4_ALLOC_TYPE_PREFER_CONTIG ||
+ alloc_type == MLX4_ALLOC_TYPE_ALL) {
+ ret = mlx4_alloc_buf_contig(mctx, buf,
+ size,
+ page_size,
+ component, NULL);
+ if (!ret)
+ return 0;
+
+ /* Checking whether CONTIG is forced */
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG)
+ return -1;
+ if (mlx4_trace)
+ printf(PFX "Contig mode allocation has failed,fallback to default mode\n");
+ }
+
+ return mlx4_alloc_buf(buf, size, page_size);
+
+}
+
+
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
{
int ret;
buf->length = align(size, page_size);
buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON, -1, 0);
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buf->buf == MAP_FAILED)
return errno;
@@ -78,6 +231,271 @@
void mlx4_free_buf(struct mlx4_buf *buf)
{
- ibv_dofork_range(buf->buf, buf->length);
- munmap(buf->buf, buf->length);
+ if (buf->length) {
+ ibv_dofork_range(buf->buf, buf->length);
+ munmap(buf->buf, buf->length);
+ }
+}
+
+/* This function computes log2(v) rounded up.
+* We don't want to have a dependency on libm, which exposes the ceil & log2 APIs.
+* Code was written based on public domain code:
+ URL: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog.
+*/
+static uint32_t mlx4_get_block_order(uint32_t v)
+{
+ static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+ static const uint32_t shift_arr[] = {1, 2, 4, 8, 16};
+ int i;
+ uint32_t input_val = v;
+
+ register uint32_t r = 0;/* result of log2(v) will go here */
+ for (i = 4; i >= 0; i--) {
+
+ if (v & bits_arr[i]) {
+ v >>= shift_arr[i];
+ r |= shift_arr[i];
+ }
+ }
+ /* Rounding up if required */
+ r += !!(input_val & ((1 << r) - 1));
+
+ return r;
+}
+
+
+static int mlx4_finalize_contiguous_alloc(struct mlx4_buf *buf,
+ void *addr,
+ size_t length)
+{
+ if (ibv_dontfork_range(addr, length)) {
+ munmap(addr, length);
+ return 1;
+ }
+
+ /* We also hook addr & length internally for further
+ use via dreg_mr. In the ibv_mr returned to the user, the length or
+ address may differ from the allocated length or address because of
+ alignment.
+ */
+ buf->buf = addr;
+ buf->length = length;
+ return 0;
+
+}
+
+
+void mlx4_get_alloc_type(struct ibv_context *context, const char *component,
+ enum mlx4_alloc_type *alloc_type,
+ enum mlx4_alloc_type default_alloc_type)
+
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+ char name_buff[128];
+
+ sprintf(name_buff, "%s_ALLOC_TYPE", component);
+
+ /* First set defaults */
+ *alloc_type = default_alloc_type;
+
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ if (!strcasecmp(env_value, "ANON"))
+ *alloc_type = MLX4_ALLOC_TYPE_ANON;
+ else if (!strcasecmp(env_value, "HUGE"))
+ *alloc_type = MLX4_ALLOC_TYPE_HUGE;
+ else if (!strcasecmp(env_value, "CONTIG"))
+ *alloc_type = MLX4_ALLOC_TYPE_CONTIG;
+ else if (!strcasecmp(env_value, "PREFER_CONTIG"))
+ *alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+ else if (!strcasecmp(env_value, "PREFER_HUGE"))
+ *alloc_type = MLX4_ALLOC_TYPE_PREFER_HUGE;
+ else if (!strcasecmp(env_value, "ALL"))
+ *alloc_type = MLX4_ALLOC_TYPE_ALL;
+ }
+
+ return;
+}
+
+
+static void mlx4_alloc_get_env_info(struct ibv_context *context,
+ int *max_log2_contig_block_size,
+ int *min_log2_contig_block_size,
+ const char *component)
+
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+ int value;
+ char name_buff[128];
+
+ /* First set defaults */
+ *max_log2_contig_block_size = MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE;
+ *min_log2_contig_block_size = MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE;
+
+ sprintf(name_buff, "%s_MAX_LOG2_CONTIG_BSIZE", component);
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ value = atoi(env_value);
+ if (value <= MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE &&
+ value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE)
+ *max_log2_contig_block_size = value;
+ else
+ fprintf(stderr,
+ "Invalid value %d for %s\n",
+ value, name_buff);
+ }
+ sprintf(name_buff, "%s_MIN_LOG2_CONTIG_BSIZE", component);
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ value = atoi(env_value);
+ if (value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE &&
+ value <= *max_log2_contig_block_size)
+ *min_log2_contig_block_size = value;
+ else
+ fprintf(stderr,
+ "Invalid value %d for %s\n",
+ value, name_buff);
+ }
+ return;
}
+
+
+
+int mlx4_alloc_buf_contig(struct mlx4_context *mctx,
+ struct mlx4_buf *buf, size_t size,
+ int page_size,
+ const char *component, void *req_addr)
+{
+ void *addr = NULL;
+ int block_size_exp;
+ int max_log2_contig_block_size;
+ int min_log2_contig_block_size;
+ int mmap_flags = MAP_SHARED;
+ void *act_addr = NULL;
+ size_t act_size = size;
+
+ struct ibv_context *context = &(mctx->ibv_ctx);
+
+ mlx4_alloc_get_env_info(&mctx->ibv_ctx,
+ &max_log2_contig_block_size,
+ &min_log2_contig_block_size,
+ component);
+
+ /* Checking that we don't pass max block size */
+ if (size >= (1 << max_log2_contig_block_size))
+ block_size_exp = max_log2_contig_block_size;
+ else
+ block_size_exp = mlx4_get_block_order(size);
+
+ if (req_addr) {
+ act_addr = (void *)((uintptr_t)req_addr & ~((uintptr_t)page_size - 1));
+ act_size += (size_t)((uintptr_t)req_addr - (uintptr_t)act_addr);
+ mmap_flags |= MAP_FIXED;
+ }
+
+ do {
+ /* The second mmap parameter holds the total required length
+ for this contiguous allocation, aligned to the page size.
+ The last (offset) parameter passed to mmap
+ must be a multiple of the page size and encodes:
+ 1) an indication that we are in contiguous-memory
+ allocation mode (command value #2), and
+ 2) the required size of each block.
+ To enable future mmap actions we
+ use the last 3 bits of the offset parameter
+ as the command identifier.
+ */
+ addr = mmap(act_addr, act_size,
+ PROT_WRITE | PROT_READ, mmap_flags,
+ context->cmd_fd,
+ page_size *
+ (MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD +
+ (block_size_exp << MLX4_MMAP_CMD_BITS)));
+
+ /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/
+ if (addr != MAP_FAILED)
+ break;
+
+ /* We failed - set addr to NULL and check whether
+ a retry is relevant.
+ * If the kernel doesn't support this command because of
+ compatibility issues we'll also get EINVAL.
+ */
+ addr = NULL;
+ if (errno == EINVAL)
+ break;
+
+ /* Retry, asking for fewer contiguous pages per block */
+ block_size_exp -= 1;
+ } while (block_size_exp >= min_log2_contig_block_size);
+
+ if (!addr)
+ return 1;
+
+ /* All was OK - perform the final steps to have this addr ready */
+ return mlx4_finalize_contiguous_alloc(buf, addr, act_size);
+}
+
+int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size)
+{
+ struct mlx4_hugetlb_mem *hmem, *tmp_hmem;
+ int found = 0;
+ int ret = 0;
+ LIST_HEAD(slist);
+
+ buf->length = align(size, MLX4_Q_CHUNK_SIZE);
+
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ list_for_each_entry_safe(hmem, tmp_hmem, &mctx->hugetlb_list, list) {
+ if (is_bitmap_avail(&hmem->bitmap)) {
+ buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap,
+ buf->length/MLX4_Q_CHUNK_SIZE, 1);
+ if (buf->base == -1)
+ continue;
+ else {
+ buf->hmem = (void *)hmem;
+ found = 1;
+ break;
+ }
+ }
+ }
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+
+ if (!found) {
+ int avail;
+
+ hmem = mxl4_hugetlb_mem_alloc(buf->length);
+ if (hmem == NULL)
+ return -1;
+
+ buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap,
+ buf->length/MLX4_Q_CHUNK_SIZE, 1);
+ if (buf->base == -1) {
+ if (mlx4_trace)
+ perror("mlx4_bitmap_alloc_range");
+ mlx4_hugetlb_mem_free(hmem);
+ return -1;
+ }
+
+ buf->hmem = (void *)hmem;
+
+ avail = is_bitmap_avail(&hmem->bitmap);
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ if (avail)
+ list_add(&hmem->list, &mctx->hugetlb_list);
+ else
+ list_add_tail(&hmem->list, &mctx->hugetlb_list);
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+ }
+
+ buf->buf = hmem->shmaddr + (buf->base * MLX4_Q_CHUNK_SIZE);
+
+ ret = ibv_dontfork_range(buf->buf, buf->length);
+ if (ret) {
+ mlx4_free_buf_huge_ex(mctx, buf, 0);
+ buf->hmem = NULL;
+ if (mlx4_trace)
+ perror("ibv_dontfork_range");
+ }
+
+ return ret;
+}
+
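
mlx4_get_block_order() above computes log2 rounded up with the bit-twiddling trick cited in its comment, which is how buf.c avoids depending on libm. The standalone sketch below reproduces that technique and checks it against a naive loop; block_order and naive_order are names invented for this illustration, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Same technique as mlx4_get_block_order(): binary-search the highest set
 * bit, then round up if any lower bit is set. */
static uint32_t block_order(uint32_t v)
{
	static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
	static const uint32_t shift_arr[] = {1, 2, 4, 8, 16};
	uint32_t input_val = v;
	uint32_t r = 0;
	int i;

	for (i = 4; i >= 0; i--) {
		if (v & bits_arr[i]) {
			v >>= shift_arr[i];
			r |= shift_arr[i];
		}
	}
	r += !!(input_val & ((1u << r) - 1));	/* round up if v is not a power of two */
	return r;
}

/* Naive reference: smallest r such that (1 << r) >= v. */
static uint32_t naive_order(uint32_t v)
{
	uint32_t r = 0;

	while ((1ull << r) < v)
		r++;
	return r;
}

int main(void)
{
	uint32_t v;

	for (v = 1; v < (1u << 20); v++)
		if (block_order(v) != naive_order(v))
			printf("mismatch at %u: %u vs %u\n",
			       v, block_order(v), naive_order(v));
	printf("block_order(4096) = %u\n", block_order(4096));	/* prints 12 */
	return 0;
}
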
Index: contrib/ofed/libmlx4/src/cq.c
===================================================================
--- contrib/ofed/libmlx4/src/cq.c
+++ contrib/ofed/libmlx4/src/cq.c
@@ -47,6 +47,8 @@
#include "mlx4.h"
#include "doorbell.h"
+int mlx4_stall_num_loop = 300;
+
enum {
MLX4_CQ_DOORBELL = 0x20
};
@@ -61,8 +63,18 @@
#define MLX4_CQ_DB_REQ_NOT (2 << 24)
enum {
+ MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25,
+ MLX4_CQE_L2_TUNNEL_L4_CSUM = 1 << 26,
+ MLX4_CQE_L2_TUNNEL = 1 << 27,
+ MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29,
+ MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
MLX4_CQE_OWNER_MASK = 0x80,
MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_INL_SCATTER_MASK = 0x20,
MLX4_CQE_OPCODE_MASK = 0x1f
};
@@ -82,23 +94,50 @@
MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22,
};
+enum {
+ MLX4_CQE_STATUS_L4_CSUM = 1 << 2,
+ MLX4_CQE_STATUS_IPV4 = 1 << 6,
+ MLX4_CQE_STATUS_IPV4F = 1 << 7,
+ MLX4_CQE_STATUS_IPV6 = 1 << 8,
+ MLX4_CQE_STATUS_IPV4OPT = 1 << 9,
+ MLX4_CQE_STATUS_TCP = 1 << 10,
+ MLX4_CQE_STATUS_UDP = 1 << 11,
+ MLX4_CQE_STATUS_IPOK = 1 << 12
+};
+
+
struct mlx4_cqe {
- uint32_t my_qpn;
+ uint32_t vlan_my_qpn;
uint32_t immed_rss_invalid;
uint32_t g_mlpath_rqpn;
- uint8_t sl;
- uint8_t reserved1;
- uint16_t rlid;
- uint32_t reserved2;
+ union {
+ struct {
+ union {
+ struct {
+ uint16_t sl_vid;
+ uint16_t rlid;
+ };
+ uint32_t timestamp_16_47;
+ };
+ uint16_t status;
+ uint8_t reserved2;
+ uint8_t badfcs_enc;
+ };
+ struct {
+ uint16_t reserved4;
+ uint8_t smac[6];
+ };
+ };
uint32_t byte_cnt;
uint16_t wqe_index;
uint16_t checksum;
- uint8_t reserved3[3];
+ uint8_t reserved5[1];
+ uint16_t timestamp_0_15;
uint8_t owner_sr_opcode;
-};
+} __attribute__((packed));
struct mlx4_err_cqe {
- uint32_t my_qpn;
+ uint32_t vlan_my_qpn;
uint32_t reserved1[5];
uint16_t wqe_index;
uint8_t vendor_err;
@@ -118,7 +157,7 @@
struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe;
return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
- !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : tcqe;
+ !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe;
}
static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq)
@@ -126,18 +165,13 @@
return get_sw_cqe(cq, cq->cons_index);
}
-static void update_cons_index(struct mlx4_cq *cq)
-{
- *cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
-}
-
static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
{
if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
printf(PFX "local QP operation err "
"(QPN %06x, WQE index %x, vendor syndrome %02x, "
"opcode = %02x)\n",
- htonl(cqe->my_qpn), htonl(cqe->wqe_index),
+ htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index),
cqe->vendor_err,
cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
@@ -191,22 +225,34 @@
static int mlx4_poll_one(struct mlx4_cq *cq,
struct mlx4_qp **cur_qp,
- struct ibv_wc *wc)
+ struct ibv_exp_wc *wc,
+ uint32_t wc_size, int is_exp)
{
struct mlx4_wq *wq;
struct mlx4_cqe *cqe;
- struct mlx4_srq *srq = NULL;
+ struct mlx4_srq *srq;
uint32_t qpn;
- uint32_t srqn;
uint32_t g_mlpath_rqpn;
uint16_t wqe_index;
int is_error;
int is_send;
-
+ int size;
+ int left;
+ int list_len;
+ int i;
+ struct mlx4_inlr_rbuff *rbuffs;
+ uint8_t *sbuff;
+ int timestamp_en = !!(cq->creation_flags &
+ IBV_EXP_CQ_TIMESTAMP);
+ uint64_t exp_wc_flags = 0;
+ uint64_t wc_flags = 0;
cqe = next_cqe_sw(cq);
if (!cqe)
return CQ_EMPTY;
+ if (cq->cqe_size == 64)
+ ++cqe;
+
++cq->cons_index;
VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
@@ -217,36 +263,44 @@
*/
rmb();
- qpn = ntohl(cqe->my_qpn);
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ wc->qp_num = qpn;
is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+
+ /* include checksum as a workaround for the calc opcode */
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
- MLX4_CQE_OPCODE_ERROR;
+ MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff);
- if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
- srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
+ if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
/*
- * We do not have to take the XRC SRQ table lock here,
- * because CQs will be locked while XRC SRQs are removed
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
* from the table.
*/
- srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
+ *cur_qp = NULL;
+ srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table,
+ ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
if (!srq)
return CQ_POLL_ERR;
- } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
- /*
- * We do not have to take the QP table lock here,
- * because CQs will be locked while QPs are removed
- * from the table.
- */
- *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
- qpn & 0xffffff);
- if (!*cur_qp)
- return CQ_POLL_ERR;
+ } else {
+ if (unlikely(!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num))) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!*cur_qp))
+ return CQ_POLL_ERR;
+ }
+ if (is_exp) {
+ wc->qp = &((*cur_qp)->verbs_qp.qp);
+ exp_wc_flags |= IBV_EXP_WC_QP;
+ }
+ srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL;
}
- wc->qp_num = qpn & 0xffffff;
-
if (is_send) {
wq = &(*cur_qp)->sq;
wqe_index = ntohs(cqe->wqe_index);
@@ -257,112 +311,267 @@
wqe_index = htons(cqe->wqe_index);
wc->wr_id = srq->wrid[wqe_index];
mlx4_free_srq_wqe(srq, wqe_index);
- } else if ((*cur_qp)->ibv_qp.srq) {
- srq = to_msrq((*cur_qp)->ibv_qp.srq);
- wqe_index = htons(cqe->wqe_index);
- wc->wr_id = srq->wrid[wqe_index];
- mlx4_free_srq_wqe(srq, wqe_index);
+ if (is_exp) {
+ wc->srq = &(srq->verbs_srq.srq);
+ exp_wc_flags |= IBV_EXP_WC_SRQ;
+ }
} else {
wq = &(*cur_qp)->rq;
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wqe_index = wq->tail & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[wqe_index];
++wq->tail;
}
- if (is_error) {
- mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ if (unlikely(is_error)) {
+ mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe,
+ (struct ibv_wc *)wc);
return CQ_OK;
}
wc->status = IBV_WC_SUCCESS;
+ if (timestamp_en && offsetof(struct ibv_exp_wc, timestamp) < wc_size) {
+ /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is
+ * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't
+ * supported */
+ if (cq->creation_flags &
+ IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME)
+ wc->timestamp = 0;
+ else {
+ wc->timestamp =
+ (uint64_t)(ntohl(cqe->timestamp_16_47) +
+ !cqe->timestamp_0_15) << 16
+ | (uint64_t)ntohs(cqe->timestamp_0_15);
+ exp_wc_flags |= IBV_EXP_WC_WITH_TIMESTAMP;
+ }
+ }
+
if (is_send) {
- wc->wc_flags = 0;
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_CALC_RDMA_WRITE_IMM:
case MLX4_OPCODE_RDMA_WRITE_IMM:
- wc->wc_flags |= IBV_WC_WITH_IMM;
+ wc_flags |= IBV_WC_WITH_IMM;
case MLX4_OPCODE_RDMA_WRITE:
- wc->opcode = IBV_WC_RDMA_WRITE;
+ wc->exp_opcode = IBV_EXP_WC_RDMA_WRITE;
break;
case MLX4_OPCODE_SEND_IMM:
- wc->wc_flags |= IBV_WC_WITH_IMM;
+ wc_flags |= IBV_WC_WITH_IMM;
case MLX4_OPCODE_SEND:
- wc->opcode = IBV_WC_SEND;
+ wc->exp_opcode = IBV_EXP_WC_SEND;
break;
case MLX4_OPCODE_RDMA_READ:
- wc->opcode = IBV_WC_RDMA_READ;
+ wc->exp_opcode = IBV_EXP_WC_RDMA_READ;
wc->byte_len = ntohl(cqe->byte_cnt);
break;
case MLX4_OPCODE_ATOMIC_CS:
- wc->opcode = IBV_WC_COMP_SWAP;
+ wc->exp_opcode = IBV_EXP_WC_COMP_SWAP;
wc->byte_len = 8;
break;
case MLX4_OPCODE_ATOMIC_FA:
- wc->opcode = IBV_WC_FETCH_ADD;
+ wc->exp_opcode = IBV_EXP_WC_FETCH_ADD;
wc->byte_len = 8;
break;
+ case MLX4_OPCODE_ATOMIC_MASK_CS:
+ wc->exp_opcode = IBV_EXP_WC_MASKED_COMP_SWAP;
+ break;
+ case MLX4_OPCODE_ATOMIC_MASK_FA:
+ wc->exp_opcode = IBV_EXP_WC_MASKED_FETCH_ADD;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ if (unlikely(!is_exp))
+ return CQ_POLL_ERR;
+ wc->exp_opcode = IBV_EXP_WC_LOCAL_INV;
+ break;
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->exp_opcode = IBV_EXP_WC_SEND;
+ break;
case MLX4_OPCODE_BIND_MW:
- wc->opcode = IBV_WC_BIND_MW;
+ wc->exp_opcode = IBV_EXP_WC_BIND_MW;
break;
default:
/* assume it's a send completion */
- wc->opcode = IBV_WC_SEND;
+ wc->exp_opcode = IBV_EXP_WC_SEND;
break;
}
} else {
wc->byte_len = ntohl(cqe->byte_cnt);
+ if ((*cur_qp) && (*cur_qp)->max_inlr_sg &&
+ (cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) {
+ rbuffs = (*cur_qp)->inlr_buff.buff[wqe_index].sg_list;
+ list_len = (*cur_qp)->inlr_buff.buff[wqe_index].list_len;
+ sbuff = mlx4_get_recv_wqe((*cur_qp), wqe_index);
+ left = wc->byte_len;
+ for (i = 0; (i < list_len) && left; i++) {
+ size = min(rbuffs->rlen, left);
+ memcpy(rbuffs->rbuff, sbuff, size);
+ left -= size;
+ rbuffs++;
+ sbuff += size;
+ }
+ if (left) {
+ wc->status = IBV_WC_LOC_LEN_ERR;
+ return CQ_OK;
+ }
+ }
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
- wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
- wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->exp_opcode = IBV_EXP_WC_RECV_RDMA_WITH_IMM;
+ wc_flags = IBV_WC_WITH_IMM;
wc->imm_data = cqe->immed_rss_invalid;
break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ if (unlikely(!is_exp))
+ return CQ_POLL_ERR;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ exp_wc_flags |= IBV_EXP_WC_WITH_INV;
+ wc->imm_data = ntohl(cqe->immed_rss_invalid);
+ break;
case MLX4_RECV_OPCODE_SEND:
- wc->opcode = IBV_WC_RECV;
- wc->wc_flags = 0;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ wc_flags = 0;
break;
case MLX4_RECV_OPCODE_SEND_IMM:
- wc->opcode = IBV_WC_RECV;
- wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ wc_flags = IBV_WC_WITH_IMM;
wc->imm_data = cqe->immed_rss_invalid;
break;
}
- wc->slid = ntohs(cqe->rlid);
- wc->sl = cqe->sl >> 4;
+ if (!timestamp_en) {
+ exp_wc_flags |= IBV_EXP_WC_WITH_SLID;
+ wc->slid = ntohs(cqe->rlid);
+ }
g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xffffff;
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
- wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
+ wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f;
+ /* When working with XRC SRQs we have no QP to check the link layer.
+ * Using the IB SL; RoCE should be considered. (TBD)
+ */
+ /* sl is invalid when timestamp is used */
+ if (!timestamp_en) {
+ if ((*cur_qp) && (*cur_qp)->link_layer ==
+ IBV_LINK_LAYER_ETHERNET)
+ wc->sl = ntohs(cqe->sl_vid) >> 13;
+ else
+ wc->sl = ntohs(cqe->sl_vid) >> 12;
+ exp_wc_flags |= IBV_EXP_WC_WITH_SL;
+ }
+ if (is_exp) {
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)) {
+ /* Only ConnectX-3 Pro reports checksum for now */
+ exp_wc_flags |=
+ MLX4_TRANSPOSE(cqe->badfcs_enc,
+ MLX4_CQE_STATUS_L4_CSUM,
+ (uint64_t)IBV_EXP_WC_RX_TCP_UDP_CSUM_OK) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPOK),
+ (uint64_t)IBV_EXP_WC_RX_IP_CSUM_OK) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPV4),
+ (uint64_t)IBV_EXP_WC_RX_IPV4_PACKET) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPV6),
+ (uint64_t)IBV_EXP_WC_RX_IPV6_PACKET) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL),
+ (uint64_t)IBV_EXP_WC_RX_TUNNEL_PACKET) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_IPOK),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_IP_CSUM_OK) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_L4_CSUM),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_TCP_UDP_CSUM_OK) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_IPV4),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_IPV4_PACKET);
+ exp_wc_flags |=
+ MLX4_TRANSPOSE(~exp_wc_flags,
+ IBV_EXP_WC_RX_OUTER_IPV4_PACKET,
+ IBV_EXP_WC_RX_OUTER_IPV6_PACKET);
+ }
+ }
}
+ if (is_exp)
+ wc->exp_wc_flags = exp_wc_flags | (uint64_t)wc_flags;
+
+ ((struct ibv_wc *)wc)->wc_flags = wc_flags;
+
return CQ_OK;
}
-int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+#if defined(__amd64__) || defined(__i386__)
+static inline unsigned long get_cycles()
+{
+ unsigned low, high;
+ unsigned long long val;
+ asm volatile ("rdtsc" : "=a" (low), "=d" (high));
+ val = high;
+ val = (val << 32) | low;
+ return val;
+}
+#else
+static inline unsigned long get_cycles()
+{
+ return 0;
+}
+#endif
+
+static void mlx4_stall_poll_cq()
+{
+ int i;
+
+ for (i = 0; i < mlx4_stall_num_loop; i++)
+ (void)get_cycles();
+}
+
+int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_exp_wc *wc,
+ uint32_t wc_size, int is_exp)
{
struct mlx4_cq *cq = to_mcq(ibcq);
struct mlx4_qp *qp = NULL;
int npolled;
int err = CQ_OK;
- pthread_spin_lock(&cq->lock);
-
+ if (unlikely(cq->stall_next_poll)) {
+ cq->stall_next_poll = 0;
+ mlx4_stall_poll_cq();
+ }
+ mlx4_lock(&cq->lock);
+
for (npolled = 0; npolled < ne; ++npolled) {
- err = mlx4_poll_one(cq, &qp, wc + npolled);
- if (err != CQ_OK)
+ err = mlx4_poll_one(cq, &qp, ((void *)wc) + npolled * wc_size,
+ wc_size, is_exp);
+ if (unlikely(err != CQ_OK))
break;
}
- if (npolled)
- update_cons_index(cq);
+ if (likely(npolled || err == CQ_POLL_ERR))
+ mlx4_update_cons_index(cq);
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
+ if (unlikely(cq->stall_enable && err == CQ_EMPTY))
+ cq->stall_next_poll = 1;
+
return err == CQ_POLL_ERR ? err : npolled;
}
+int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries,
+ struct ibv_exp_wc *wc, uint32_t wc_size)
+{
+ return mlx4_poll_cq(ibcq, num_entries, wc, wc_size, 1);
+}
+
+int mlx4_poll_ibv_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+{
+ return mlx4_poll_cq(ibcq, ne, (struct ibv_exp_wc *)wc, sizeof(*wc), 0);
+}
+
int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
{
struct mlx4_cq *cq = to_mcq(ibvcq);
@@ -402,12 +611,10 @@
uint32_t prod_index;
uint8_t owner_bit;
int nfreed = 0;
- int is_xrc_srq = 0;
int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
- if (srq && srq->ibv_srq.xrc_cq)
- is_xrc_srq = 1;
-
+ if (cq->last_qp && cq->last_qp->verbs_qp.qp.qp_num == qpn)
+ cq->last_qp = NULL;
/*
* First we need to find the current producer index, so we
* know where to start cleaning from. It doesn't matter if HW
@@ -426,12 +633,12 @@
while ((int) --prod_index - (int) cq->cons_index >= 0) {
cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
cqe += cqe_inc;
- if (is_xrc_srq &&
- (ntohl(cqe->g_mlpath_rqpn & 0xffffff) == srq->srqn) &&
+ if (srq && srq->ext_srq &&
+ ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num &&
!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
++nfreed;
- } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
+ } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
++nfreed;
@@ -452,22 +659,22 @@
* updating consumer index.
*/
wmb();
- update_cons_index(cq);
+ mlx4_update_cons_index(cq);
}
}
void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
{
- pthread_spin_lock(&cq->lock);
+ mlx4_lock(&cq->lock);
__mlx4_cq_clean(cq, qpn, srq);
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
}
int mlx4_get_outstanding_cqes(struct mlx4_cq *cq)
{
uint32_t i;
- for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i)
+ for (i = cq->cons_index; get_sw_cqe(cq, i); ++i)
;
return i - cq->cons_index;
@@ -496,13 +703,491 @@
++cq->cons_index;
}
-int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent,
int entry_size)
{
- if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size),
- dev->page_size))
+ struct mlx4_device *dev = to_mdev(mctx->ibv_ctx.device);
+ int ret;
+ enum mlx4_alloc_type alloc_type;
+ enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+
+ if (mlx4_use_huge(&mctx->ibv_ctx, "HUGE_CQ"))
+ default_alloc_type = MLX4_ALLOC_TYPE_HUGE;
+
+ mlx4_get_alloc_type(&mctx->ibv_ctx, MLX4_CQ_PREFIX, &alloc_type,
+ default_alloc_type);
+
+ ret = mlx4_alloc_prefered_buf(mctx, buf,
+ align(nent * entry_size, dev->page_size),
+ dev->page_size,
+ alloc_type,
+ MLX4_CQ_PREFIX);
+
+ if (ret)
return -1;
+
memset(buf->buf, 0, nent * entry_size);
return 0;
}
+
+/*
+ * poll family functions
+ */
+static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) __attribute__((always_inline));
+static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl)
+{
+ struct mlx4_srq *srq;
+ uint32_t qpn;
+ uint16_t wqe_index;
+
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+
+
+ if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) {
+ if (unlikely(qpn & MLX4_XRC_QPN_BIT)) {
+ /*
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
+ * from the table.
+ */
+ cur_qp = NULL;
+ srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table,
+ ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
+ if (!srq)
+ return CQ_POLL_ERR;
+
+ /* Advance indexes only on success */
+ wqe_index = htons(cqe->wqe_index);
+ mlx4_free_srq_wqe(srq, wqe_index);
+
+ ++cq->cons_index;
+
+ return CQ_OK;
+ }
+
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!cur_qp))
+ return CQ_POLL_ERR;
+ cq->last_qp = cur_qp;
+ }
+
+ if (!cur_qp->max_inlr_sg) {
+ /* Advance indexes only on success to enable getting
+ * the full CQE with ibv_poll_cq in case of failure
+ */
+ if (unlikely(cur_qp->verbs_qp.qp.srq)) {
+ wqe_index = htons(cqe->wqe_index);
+ mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index);
+ } else {
+ ++cur_qp->rq.tail;
+ }
+ ++cq->cons_index;
+
+ return CQ_OK;
+ }
+
+ /* We get here only when cur_qp->max_inlr_sg != 0 */
+ if (likely(cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) {
+ int size;
+ int left;
+ int list_len;
+ int i;
+ struct mlx4_inlr_rbuff *rbuffs;
+ uint8_t *sbuff;
+ int is_error;
+
+ /* include checksum as a workaround for the calc opcode */
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff);
+ if (unlikely(is_error))
+ return CQ_POLL_ERR;
+
+ wqe_index = cur_qp->rq.tail & (cur_qp->rq.wqe_cnt - 1);
+ sbuff = mlx4_get_recv_wqe(cur_qp, wqe_index);
+ left = ntohl(cqe->byte_cnt);
+ if (likely(buf)) {
+ *inl = 1;
+ memcpy(buf, sbuff, left);
+ } else {
+ rbuffs = cur_qp->inlr_buff.buff[wqe_index].sg_list;
+ list_len = cur_qp->inlr_buff.buff[wqe_index].list_len;
+ for (i = 0; (i < list_len) && left; i++) {
+ size = min(rbuffs->rlen, left);
+ memcpy(rbuffs->rbuff, sbuff, size);
+ left -= size;
+ rbuffs++;
+ sbuff += size;
+ }
+ if (left)
+ return CQ_POLL_ERR;
+ }
+ }
+
+ /* Advance indexes only on success to enable getting
+ * the full CQE with ibv_poll_cq in case of failure
+ */
+ ++cur_qp->rq.tail;
+
+ ++cq->cons_index;
+
+ return CQ_OK;
+}
+
+static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp) __attribute__((always_inline));
+static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp)
+{
+ uint32_t qpn;
+
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!cur_qp))
+ return CQ_POLL_ERR;
+ cq->last_qp = cur_qp;
+ }
+
+ /* Advance indexes only on success */
+ cur_qp->sq.tail += (uint16_t)(ntohs(cqe->wqe_index) - (uint16_t)cur_qp->sq.tail);
+ ++cq->cons_index;
+
+ return CQ_OK;
+}
+
+static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) __attribute__((always_inline));
+static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size)
+{
+ int cqe_off = (cqe_size & 64) >> 1; /* CQE offset is 32 bytes in case cqe_size == 64 */
+ struct mlx4_cqe *cqe = cq->buf.buf + (cq->cons_index & cq->ibv_cq.cqe) * cqe_size + cqe_off;
+
+ if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cq->cons_index & (cq->ibv_cq.cqe + 1)))
+ return NULL;
+
+ VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ return cqe;
+}
+
+static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) __attribute__((always_inline));
+static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_cqe *cqe;
+ int npolled;
+ int err = CQ_OK;
+
+ if (unlikely(use_lock))
+ mlx4_lock(&cq->lock);
+
+ for (npolled = 0; npolled < max_entries; ++npolled) {
+ cqe = get_next_cqe(cq, cqe_size);
+ if (!cqe) {
+ err = CQ_EMPTY;
+ break;
+ }
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ if (likely(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ err = update_sq_tail(cq, cqe, cq->last_qp);
+ else
+ err = drain_rx(cq, cqe, cq->last_qp, NULL, NULL);
+
+ if (unlikely(err != CQ_OK))
+ break;
+ }
+
+ if (likely(npolled)) {
+ mlx4_update_cons_index(cq);
+ err = CQ_OK;
+ }
+
+ if (unlikely(use_lock))
+ mlx4_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? -1 : npolled;
+}
+
+static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) __attribute__((always_inline));
+static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe)
+{
+ /* Only ConnectX-3 Pro reports checksum for now */
+ if (likely(cur_qp && (cur_qp->qp_cap_cache &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP))) {
+ int32_t flags;
+ int32_t tmp;
+
+ /*
+ * The relevant bits sit at different locations in their
+ * CQE fields, so they can be combined into a single
+ * 32-bit variable.
+ */
+ tmp = (cqe->badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) |
+ (ntohs(cqe->status) & (MLX4_CQE_STATUS_IPOK |
+ MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV6)) |
+ (ntohl(cqe->vlan_my_qpn) & (MLX4_CQE_L2_TUNNEL |
+ MLX4_CQE_L2_TUNNEL_IPOK |
+ MLX4_CQE_L2_TUNNEL_L4_CSUM |
+ MLX4_CQE_L2_TUNNEL_IPV4));
+ if (likely(tmp == cur_qp->cached_rx_csum_flags)) {
+ flags = cur_qp->transposed_rx_csum_flags;
+ } else {
+ flags = mlx4_transpose(tmp, MLX4_CQE_STATUS_IPOK, IBV_EXP_CQ_RX_IP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_L4_CSUM, IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV4, IBV_EXP_CQ_RX_IPV4_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV6, IBV_EXP_CQ_RX_IPV6_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL, IBV_EXP_CQ_RX_TUNNEL_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPOK, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_L4_CSUM, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET) |
+ mlx4_transpose(~tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV6_PACKET);
+ cur_qp->cached_rx_csum_flags = tmp;
+ cur_qp->transposed_rx_csum_flags = flags;
+ }
+
+ return flags;
+ }
+
+ return 0;
+}
+
+static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl,
+ const int use_lock, const int cqe_size,
+ uint32_t *flags) __attribute__((always_inline));
+static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl,
+ const int use_lock, const int cqe_size,
+ uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_cqe *cqe;
+ int32_t size = 0;
+ int err;
+
+ if (unlikely(use_lock))
+ mlx4_lock(&cq->lock);
+
+ cqe = get_next_cqe(cq, cqe_size);
+ if (cqe) {
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+ if (likely(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))) {
+ err = drain_rx(cq, cqe, cq->last_qp, buf, inl);
+ if (likely(err == CQ_OK)) {
+ size = ntohl(cqe->byte_cnt);
+ if (flags)
+ *flags = get_flags(cq->last_qp, cqe);
+ mlx4_update_cons_index(cq);
+ }
+ } else {
+ err = CQ_POLL_ERR;
+ }
+
+ } else {
+ err = CQ_EMPTY;
+ }
+
+
+ if (unlikely(use_lock))
+ mlx4_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? -1 : size;
+}
+
+int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_cnt(ibcq, max, 1, cq->cqe_size);
+}
+
+int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_cnt(ibcq, max, 0, cq->cqe_size);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 32);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 64);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 128);
+}
+
+int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 1, cq->cqe_size, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 0, cq->cqe_size, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 32, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 64, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 128, NULL);
+}
+
+int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 1, cq->cqe_size, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 0, cq->cqe_size, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 32, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 64, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 128, flags);
+}
+
+static struct ibv_exp_cq_family mlx4_poll_cq_family_safe = {
+ .poll_cnt = mlx4_poll_cnt_safe,
+ .poll_length = mlx4_poll_length_safe,
+ .poll_length_flags = mlx4_poll_length_flags_safe
+};
+
+enum mlx4_poll_cq_cqe_sizes {
+ MLX4_POLL_CQ_CQE_32 = 0,
+ MLX4_POLL_CQ_CQE_64 = 1,
+ MLX4_POLL_CQ_CQE_128 = 2,
+ MLX4_POLL_CQ_CQE_OTHER = 3,
+ MLX4_POLL_CQ_NUM_CQE_SIZES = 4,
+};
+
+static struct ibv_exp_cq_family mlx4_poll_cq_family_unsafe_tbl[MLX4_POLL_CQ_NUM_CQE_SIZES] = {
+ [MLX4_POLL_CQ_CQE_32] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe32,
+ .poll_length = mlx4_poll_length_unsafe_cqe32,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe32
+ },
+ [MLX4_POLL_CQ_CQE_64] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe64,
+ .poll_length = mlx4_poll_length_unsafe_cqe64,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe64
+ },
+ [MLX4_POLL_CQ_CQE_128] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe128,
+ .poll_length = mlx4_poll_length_unsafe_cqe128,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe128
+ },
+ [MLX4_POLL_CQ_CQE_OTHER] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_other,
+ .poll_length = mlx4_poll_length_unsafe_other,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_other
+ },
+};
+
+struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ enum mlx4_poll_cq_cqe_sizes cqe_size = MLX4_POLL_CQ_CQE_OTHER;
+
+ if (params->flags) {
+ fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for CQ family\n", params->flags);
+ *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+ if (params->family_flags) {
+ fprintf(stderr, PFX "Family flags(0x%x) are not supported for CQ family\n", params->family_flags);
+ *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ if (cq->model_flags & MLX4_CQ_MODEL_FLAG_THREAD_SAFE)
+ return &mlx4_poll_cq_family_safe;
+
+ if (cq->cqe_size == 32)
+ cqe_size = MLX4_POLL_CQ_CQE_32;
+ else if (cq->cqe_size == 64)
+ cqe_size = MLX4_POLL_CQ_CQE_64;
+ else if (cq->cqe_size == 128)
+ cqe_size = MLX4_POLL_CQ_CQE_128;
+
+ return &mlx4_poll_cq_family_unsafe_tbl[cqe_size];
+}
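[Editor's note] The table above maps each unsafe CQE size to a dedicated poll family, while thread-safe CQs always receive the locking variants. A minimal usage sketch follows (illustration only, not part of the patch); it assumes the libibverbs experimental interface-query entry point ibv_exp_query_intf() and the conventional field names of struct ibv_exp_query_intf_params, while the struct ibv_exp_cq_family callbacks themselves are taken from this diff.

/* Sketch: drain a CQ through the exp CQ poll family returned above.
 * Assumes <infiniband/verbs_exp.h> and <string.h>; exp field names are
 * an assumption, not confirmed by this patch.
 */
static int drain_cq_with_family(struct ibv_context *ctx, struct ibv_cq *cq)
{
        struct ibv_exp_query_intf_params params;
        enum ibv_exp_query_intf_status status;
        struct ibv_exp_cq_family *fam;
        int32_t n;

        memset(&params, 0, sizeof(params));
        params.intf_scope = IBV_EXP_INTF_GLOBAL;  /* assumed field names */
        params.intf = IBV_EXP_INTF_CQ;
        params.obj = cq;

        fam = ibv_exp_query_intf(ctx, &params, &status);
        if (!fam)
                return -1;

        /* poll_cnt() returns the number of reaped completions or -1 */
        do {
                n = fam->poll_cnt(cq, 16);
        } while (n > 0);

        return n < 0 ? -1 : 0;
}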
Index: contrib/ofed/libmlx4/src/doorbell.h
===================================================================
--- contrib/ofed/libmlx4/src/doorbell.h
+++ contrib/ofed/libmlx4/src/doorbell.h
@@ -33,7 +33,8 @@
#ifndef DOORBELL_H
#define DOORBELL_H
-#ifdef __LP64__
+#if __LP64__
+
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0])
#elif __BYTE_ORDER == __BIG_ENDIAN
@@ -51,10 +52,10 @@
static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
{
- pthread_spin_lock(&ctx->uar_lock);
+ mlx4_spin_lock(&ctx->uar_lock);
*(volatile uint32_t *) (ctx->uar + offset) = val[0];
*(volatile uint32_t *) (ctx->uar + offset + 4) = val[1];
- pthread_spin_unlock(&ctx->uar_lock);
+ mlx4_spin_unlock(&ctx->uar_lock);
}
#endif
Index: contrib/ofed/libmlx4/src/list.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/list.h
@@ -0,0 +1,330 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1;
+ entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del(list->prev, list->next);
+ list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ __list_del(list->prev, list->next);
+ list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is
+ * empty _and_ checks that no other CPU might be
+ * in the process of still modifying either member
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ *
+ * @head: the list to test.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+ struct list_head *next = head->next;
+ return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+ struct list_head *head)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+ struct list_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ *
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#ifndef container_of
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member)*__mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; prefetch(pos->next), pos != (head); \
+ pos = pos->next)
+
+/**
+ * __list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev - iterate over a list backwards
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
+ pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ prefetch(pos->member.next), &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member); \
+ prefetch(pos->member.prev), &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use as a start point in
+ * list_for_each_entry_continue
+ * @pos: the type * to use as a start point
+ * @head: the head of the list
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_prepare_entry(pos, head, member) \
+ ((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - iterate over list of given type
+ * continuing after existing point
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+ prefetch(pos->member.next), &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop counter.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#endif
+
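[Editor's note] Since list.h imports the Linux-style intrusive list verbatim, a small self-contained usage sketch may help readers used only to container-style lists (illustration only, not part of the patch; iteration uses __list_for_each to avoid the prefetch() dependency of the other iterators).

#include <stdlib.h>
#include "list.h"

struct item {
        int val;
        struct list_head node;          /* link embedded in the payload */
};

static int list_demo(void)
{
        LIST_HEAD(items);               /* head points to itself: empty */
        struct list_head *pos, *n;
        int i, sum = 0;

        for (i = 0; i < 3; i++) {
                struct item *it = calloc(1, sizeof(*it));

                if (!it)
                        break;
                it->val = i;
                list_add_tail(&it->node, &items);   /* FIFO insertion */
        }

        __list_for_each(pos, &items)                /* no prefetch() needed */
                sum += list_entry(pos, struct item, node)->val;

        list_for_each_safe(pos, n, &items) {        /* safe against removal */
                struct item *it = list_entry(pos, struct item, node);

                list_del(&it->node);
                free(it);
        }
        return sum;                                 /* 0 + 1 + 2 == 3 */
}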
Index: contrib/ofed/libmlx4/src/mlx4-abi.h
===================================================================
--- contrib/ofed/libmlx4/src/mlx4-abi.h
+++ contrib/ofed/libmlx4/src/mlx4-abi.h
@@ -35,14 +35,22 @@
#include <infiniband/kern-abi.h>
-#define MLX4_UVERBS_MIN_ABI_VERSION 2
+#define MLX4_UVERBS_MIN_ABI_VERSION 3
#define MLX4_UVERBS_MAX_ABI_VERSION 4
+enum {
+ MLX4_USER_DEV_CAP_64B_CQE = 1L << 0,
+#ifdef MLX4_WQE_FORMAT
+ MLX4_USER_DEV_CAP_WQE_FORMAT = 1L << 1
+#endif
+};
+
struct mlx4_alloc_ucontext_resp_v3 {
struct ibv_get_context_resp ibv_resp;
__u32 qp_tab_size;
__u16 bf_reg_size;
__u16 bf_regs_per_page;
+ __u32 cqe_size;
};
struct mlx4_alloc_ucontext_resp {
@@ -54,6 +62,14 @@
__u32 cqe_size;
};
+struct mlx4_alloc_ucontext_req {
+ struct ibv_get_context cmd;
+#ifdef MLX4_WQE_FORMAT
+ __u32 lib_caps;
+ __u32 reserved;
+#endif
+};
+
struct mlx4_alloc_pd_resp {
struct ibv_alloc_pd_resp ibv_resp;
__u32 pdn;
@@ -77,16 +93,14 @@
__u64 buf_addr;
};
-#ifdef HAVE_IBV_XRC_OPS
-struct mlx4_create_xrc_srq {
- struct ibv_create_xrc_srq ibv_cmd;
+struct mlx4_create_srq {
+ struct ibv_create_srq ibv_cmd;
__u64 buf_addr;
__u64 db_addr;
};
-#endif
-struct mlx4_create_srq {
- struct ibv_create_srq ibv_cmd;
+struct mlx4_create_xsrq {
+ struct ibv_create_xsrq ibv_cmd;
__u64 buf_addr;
__u64 db_addr;
};
@@ -97,8 +111,7 @@
__u32 reserved;
};
-struct mlx4_create_qp {
- struct ibv_create_qp ibv_cmd;
+struct mlx4_create_qp_base {
__u64 buf_addr;
__u64 db_addr;
__u8 log_sq_bb_count;
@@ -107,12 +120,14 @@
__u8 reserved[5];
};
-#ifdef HAVE_IBV_XRC_OPS
-struct mlx4_open_xrc_domain_resp {
- struct ibv_open_xrc_domain_resp ibv_resp;
- __u32 xrcdn;
- __u32 reserved;
+struct mlx4_exp_create_qp_provider {
+ struct mlx4_create_qp_base base;
+ __u64 uar_virt_add;
+};
+
+struct mlx4_create_qp {
+ struct ibv_create_qp ibv_cmd;
+ struct mlx4_create_qp_base base;
};
-#endif
#endif /* MLX4_ABI_H */
Index: contrib/ofed/libmlx4/src/mlx4.h
===================================================================
--- contrib/ofed/libmlx4/src/mlx4.h
+++ contrib/ofed/libmlx4/src/mlx4.h
@@ -34,10 +34,32 @@
#ifndef MLX4_H
#define MLX4_H
+#include <stdio.h>
#include <stddef.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
#include <infiniband/driver.h>
+#include <infiniband/driver_exp.h>
#include <infiniband/arch.h>
+#include <infiniband/verbs.h>
+#include <infiniband/verbs_exp.h>
+
+#define MLX4_MMAP_CMD_BITS 8
+#define MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD 2
+#define MLX4_IB_MMAP_GET_HW_CLOCK 3
+
+/* Use EXP mmap commands until they are accepted upstream */
+#define MLX4_IB_EXP_MMAP_EXT_UAR_PAGE 0xFE
+#define MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE 0xFF
+
+#define MLX4_IB_MMAP_CMD_MASK 0xFF
+#define MLX4_CQ_PREFIX "MLX_CQ"
+#define MLX4_QP_PREFIX "MLX_QP"
+#define MLX4_MR_PREFIX "MLX_MR"
+#define MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE 23
+#define MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE 12
+#define MLX4_PORTS_NUM 2
#ifdef HAVE_VALGRIND_MEMCHECK_H
@@ -69,7 +91,7 @@
#if defined(__i386__)
#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
-#elif defined(__x86_64__)
+#elif defined(__amd64__)
#define wc_wmb() asm volatile("sfence" ::: "memory")
#elif defined(__ia64__)
#define wc_wmb() asm volatile("fwb" ::: "memory")
@@ -79,29 +101,93 @@
#endif
-#ifndef HAVE_IBV_MORE_OPS
-#undef HAVE_IBV_XRC_OPS
-#undef HAVE_IBV_CREATE_QP_EXP
-#endif
-
#define HIDDEN __attribute__((visibility ("hidden")))
+#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if MLX4_GCC_VERSION >= 403
+# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64)))
+# define __MLX4_ALGN_DATA__ __attribute__((aligned(64)))
+#else
+# define __MLX4_ALGN_FUNC__
+# define __MLX4_ALGN_DATA__
+#endif
+
#define PFX "mlx4: "
#ifndef max
-#define max(a,b) \
+#define max(a, b) \
({ typeof (a) _a = (a); \
typeof (b) _b = (b); \
_a > _b ? _a : _b; })
#endif
#ifndef min
-#define min(a,b) \
+#define min(a, b) \
({ typeof (a) _a = (a); \
typeof (b) _b = (b); \
_a < _b ? _a : _b; })
#endif
+#ifndef likely
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x),1)
+#else
+#define likely(x) (x)
+#endif
+#endif
+
+
+#ifndef unlikely
+#ifdef __GNUC__
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define unlikely(x) (x)
+#endif
+#endif
+
+#ifndef uninitialized_var
+#define uninitialized_var(x) x = x
+#endif
+
+#include "list.h"
+
+/****************************************/
+/* ioctl codes */
+/****************************************/
+#define MLX4_IOC_MAGIC 'm'
+#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int)
+
+/* Generic macro to convert MLX4 to IBV flags. */
+#define MLX4_TRANSPOSE(val, from, to) \
+ (((from) >= (to)) ? \
+ (((val) & (from)) / ((from) / (to))) : \
+ (((val) & (from)) * ((to) / (from))))
+
+static inline uint64_t mlx4_transpose_uint16_t(uint16_t val, uint16_t from, uint64_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
+static inline uint64_t mlx4_transpose_uint32_t(uint32_t val, uint32_t from, uint64_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
+static inline uint32_t mlx4_transpose(uint32_t val, uint32_t from, uint32_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
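[Editor's note] MLX4_TRANSPOSE() relocates a flag from its bit position in the "from" mask to its position in the "to" mask by scaling with the ratio of the two masks; the typed wrappers above merely fix the promotion width. A tiny worked example with hypothetical constants (illustration only, not part of the patch):

/* Hypothetical constants, for illustration only. */
enum {
        EXAMPLE_HW_FLAG   = 1 << 12,    /* bit as reported by the HW     */
        EXAMPLE_VERB_FLAG = 1 << 2,     /* bit expected by the verbs API */
};

static uint32_t transpose_example(void)
{
        /*
         * (0x1234 & 0x1000) == 0x1000; since from > to the macro divides
         * by (0x1000 / 0x4) == 0x400, yielding 0x4 == EXAMPLE_VERB_FLAG.
         */
        return mlx4_transpose(0x1234, EXAMPLE_HW_FLAG, EXAMPLE_VERB_FLAG);
}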
+enum {
+ MLX4_MAX_FAMILY_VER = 0
+};
+
+enum {
+ MLX4_MAX_BFS_IN_PAGE = 8,
+ MLX4_BFS_STRIDE = 512,
+};
+
enum {
MLX4_STAT_RATE_OFFSET = 5
};
@@ -112,14 +198,86 @@
MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
};
+#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl(wr->qp_type.xrc.remote_srqn << 8)
+
enum {
- MLX4_XRC_SRQ_TABLE_BITS = 8,
- MLX4_XRC_SRQ_TABLE_SIZE = 1 << MLX4_XRC_SRQ_TABLE_BITS,
- MLX4_XRC_SRQ_TABLE_MASK = MLX4_XRC_SRQ_TABLE_SIZE - 1
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
};
enum {
- MLX4_XRC_QPN_BIT = (1 << 23)
+ MLX4_QP_PATTERN = 0x012389AB,
+ MLX4_CQ_PATTERN = 0x4567CDEF
+};
+
+enum mlx4_lock_type {
+ MLX4_SPIN_LOCK = 0,
+ MLX4_MUTEX = 1,
+};
+
+enum mlx4_lock_state {
+ MLX4_USE_LOCK,
+ MLX4_LOCKED,
+ MLX4_UNLOCKED
+};
+
+/* QP DoorBell ringing methods */
+enum mlx4_db_method {
+ MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB,/* QP has dedicated BF, */
+ /* only one thread is using this QP, */
+ /* the arch supports WC auto evict and */
+ /* prefer_bf flag is set. */
+ /* This means that there is no need for */
+ /* wc_wmb to flush the WC buffer */
+ MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB, /* Same as previous but prefer_bf */
+ /* flag is not set */
+ MLX4_QP_DB_METHOD_DEDIC_BF, /* QP has dedicated BF */
+ MLX4_QP_DB_METHOD_BF, /* QP has BF which may be shared with other QPs */
+ MLX4_QP_DB_METHOD_DB /* BF is not valid for this QP, use DoorBell to send the messages */
+};
+
+enum mlx4_res_domain_bf_type {
+ MLX4_RES_DOMAIN_BF_NONE, /* No BF for this resource domain */
+ MLX4_RES_DOMAIN_BF_SAFE, /* Use BF when possible */
+ MLX4_RES_DOMAIN_BF_UNSAFE, /* Use BF when possible. */
+ /* The application is responsible to sync between */
+ /* calls to objects using this resource domain. */
+ /* This means that there is no need to use the BF */
+ /* lock. */
+ MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT, /* Use BF when possible. */
+ /* Only one thread is using this resource */
+ /* and the arch supports WC auto-evict. */
+ /* This means that there is no need to use */
+ /* wc_wmb function to flush the BF buffer */
+
+};
+
+struct mlx4_xsrq_table {
+ struct {
+ struct mlx4_srq **table;
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+
+ pthread_mutex_t mutex;
+ int num_xsrq;
+ int shift;
+ int mask;
+};
+
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+
+enum {
+ MLX4_XRC_QPN_BIT = (1 << 23)
+};
+
+enum qp_cap_cache {
+ /* The flag below includes VXLAN support as well in mlx4 HW*/
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1
};
enum mlx4_db_type {
@@ -128,6 +286,15 @@
MLX4_NUM_DB_TYPE
};
+enum mlx4_alloc_type {
+ MLX4_ALLOC_TYPE_ANON,
+ MLX4_ALLOC_TYPE_HUGE,
+ MLX4_ALLOC_TYPE_CONTIG,
+ MLX4_ALLOC_TYPE_PREFER_HUGE,
+ MLX4_ALLOC_TYPE_PREFER_CONTIG,
+ MLX4_ALLOC_TYPE_ALL
+};
+
enum {
MLX4_OPCODE_NOP = 0x00,
MLX4_OPCODE_SEND_INVAL = 0x01,
@@ -146,6 +313,12 @@
MLX4_OPCODE_LOCAL_INVAL = 0x1b,
MLX4_OPCODE_CONFIG_CMD = 0x1f,
+ MLX4_OPCODE_SEND_ENABLE = 0x17,
+ MLX4_OPCODE_RECV_ENABLE = 0x16,
+ MLX4_OPCODE_CQE_WAIT = 0x0f,
+ MLX4_OPCODE_CALC_SEND = 0x1e,
+ MLX4_OPCODE_CALC_RDMA_WRITE_IMM = 0x1f,
+
MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00,
MLX4_RECV_OPCODE_SEND = 0x01,
MLX4_RECV_OPCODE_SEND_IMM = 0x02,
@@ -155,28 +328,86 @@
MLX4_CQE_OPCODE_RESIZE = 0x16,
};
+extern int mlx4_stall_num_loop;
+extern int mlx4_trace;
+extern int mlx4_single_threaded;
+extern int mlx4_use_mutex;
+
enum {
MLX4_MAX_WQE_SIZE = 1008
};
struct mlx4_device {
- struct ibv_device ibv_dev;
+ struct verbs_device verbs_dev;
int page_size;
- int driver_abi_ver;
+
+ struct {
+ unsigned id;
+ unsigned short rev;
+ } devid;
+ int driver_abi_ver;
};
struct mlx4_db_page;
+struct mlx4_lock {
+ pthread_mutex_t mutex;
+ pthread_spinlock_t slock;
+ enum mlx4_lock_state state;
+ enum mlx4_lock_type type;
+};
+
+struct mlx4_spinlock {
+ pthread_spinlock_t lock;
+ enum mlx4_lock_state state;
+};
+
+/* struct for BF dedicated for one QP */
+struct mlx4_dedic_bf {
+ void *address;
+};
+
+/* struct for the common BF which may be shared by many QPs */
+struct mlx4_cmn_bf {
+ void *address;
+ /*
+ * Protect usage of BF address field including data written to the BF
+ * and the BF buffer toggling.
+ */
+ struct mlx4_lock lock;
+};
+
+union mlx4_bf {
+ struct mlx4_dedic_bf dedic;
+ struct mlx4_cmn_bf cmn;
+};
+
+struct mlx4_bfs_data {
+ struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1];
+ struct mlx4_cmn_bf cmn_bf;
+ uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1];
+ uint8_t dedic_bf_free;
+ struct mlx4_spinlock dedic_bf_lock; /* protect dedicated BFs managing */
+ /* including dedic_bf_used and */
+ /* dedic_bf_free fields */
+ void *page;
+ uint16_t buf_size;
+ uint8_t num_dedic_bfs;
+};
+
struct mlx4_context {
- struct ibv_context ibv_ctx;
+ union {
+ struct ibv_context ibv_ctx;
+ };
+ struct mlx4_spinlock send_db_lock; /* protects send_db_list and send_db_num_uars */
+ struct list_head send_db_list;
+ unsigned int send_db_num_uars;
void *uar;
- pthread_spinlock_t uar_lock;
-
- void *bf_page;
- int bf_buf_size;
- int bf_offset;
- pthread_spinlock_t bf_lock;
+ struct mlx4_spinlock uar_lock;
+ struct mlx4_bfs_data bfs;
+ int bf_regs_per_page;
+ int max_ctx_res_domain;
struct {
struct mlx4_qp **table;
@@ -189,24 +420,39 @@
int max_qp_wr;
int max_sge;
int max_cqe;
- int cqe_size;
-
+ uint64_t exp_device_cap_flags;
struct {
- struct mlx4_srq **table;
- int refcnt;
- } xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE];
- pthread_mutex_t xrc_srq_table_mutex;
- int num_xrc_srqs;
- int xrc_srq_table_shift;
- int xrc_srq_table_mask;
+ int offset;
+ int mult;
+ int shift;
+ uint64_t mask;
+ } core_clk;
+ void *hca_core_clock;
+
+ struct mlx4_xsrq_table xsrq_table;
struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
pthread_mutex_t db_list_mutex;
+ int cqe_size;
+ int prefer_bf;
+ struct mlx4_spinlock hugetlb_lock;
+ struct list_head hugetlb_list;
+ int stall_enable;
+ pthread_mutex_t task_mutex;
+ struct {
+ uint8_t valid;
+ uint8_t link_layer;
+ enum ibv_port_cap_flags caps;
+ } port_query_cache[MLX4_PORTS_NUM];
+ pthread_mutex_t env_mtx;
+ int env_initialized;
};
struct mlx4_buf {
void *buf;
+ void *hmem;
size_t length;
+ int base;
};
struct mlx4_pd {
@@ -214,23 +460,40 @@
uint32_t pdn;
};
+enum mlx4_cq_model_flags {
+ /*
+ * When set the CQ API must be thread safe.
+ * When reset application is taking care
+ * to sync between CQ API calls.
+ */
+ MLX4_CQ_MODEL_FLAG_THREAD_SAFE = 1 << 0,
+};
+
struct mlx4_cq {
- struct ibv_cq ibv_cq;
+ struct ibv_cq ibv_cq __MLX4_ALGN_DATA__;
+ uint32_t pattern;
struct mlx4_buf buf;
struct mlx4_buf resize_buf;
- pthread_spinlock_t lock;
+ struct mlx4_lock lock;
uint32_t cqn;
uint32_t cons_index;
+ uint32_t wait_index;
+ uint32_t wait_count;
uint32_t *set_ci_db;
uint32_t *arm_db;
int arm_sn;
- int cqe_size;
+ int stall_next_poll;
+ int stall_enable;
+ int cqe_size;
+ int creation_flags;
+ struct mlx4_qp *last_qp;
+ uint32_t model_flags; /* use mlx4_cq_model_flags */
};
struct mlx4_srq {
- struct ibv_srq ibv_srq;
+ struct verbs_srq verbs_srq;
struct mlx4_buf buf;
- pthread_spinlock_t lock;
+ struct mlx4_spinlock lock;
uint64_t *wrid;
uint32_t srqn;
int max;
@@ -240,33 +503,102 @@
int tail;
uint32_t *db;
uint16_t counter;
+ uint8_t ext_srq;
+ struct ibv_srq_legacy *ibv_srq_legacy;
};
struct mlx4_wq {
uint64_t *wrid;
- pthread_spinlock_t lock;
+ struct mlx4_lock lock;
int wqe_cnt;
int max_post;
+ char *buf;
unsigned head;
unsigned tail;
int max_gs;
int wqe_shift;
- int offset;
+
+ /* SEND/RECV_ENABLE data */
+ unsigned head_en_index;
+ unsigned head_en_count;
+};
+
+/* enclosing ibv_mr adding some extra managing information */
+struct mlx4_mr {
+ struct ibv_mr ibv_mr;
+ struct mlx4_buf buf;
+ uint64_t allocation_flags;
+ int shared_mr;
+};
+
+
+struct mlx4_inlr_rbuff {
+ void *rbuff;
+ int rlen;
+};
+
+struct mlx4_inlr_sg_list {
+ struct mlx4_inlr_rbuff *sg_list;
+ int list_len;
+};
+
+struct mlx4_inlr_buff {
+ struct mlx4_inlr_sg_list *buff;
+ int len;
+};
+
+struct mlx4_send_db_data {
+ union mlx4_bf bf;
+ uint32_t *db_addr; /* Points to the BF related send DB */
+ struct list_head list;
+};
+
+enum mlx4_qp_model_flags {
+ /*
+ * When set the QP API must be thread safe.
+ * When reset application is taking care
+ * to sync between QP API calls.
+ */
+ MLX4_QP_MODEL_FLAG_THREAD_SAFE = 1 << 0,
};
struct mlx4_qp {
- struct ibv_qp ibv_qp;
- struct mlx4_buf buf;
- int max_inline_data;
+ struct verbs_qp verbs_qp;
+ uint32_t pattern;
int buf_size;
-
+ uint32_t model_flags; /* use mlx4_qp_model_flags */
+
+ /* hot post send data */
+ struct mlx4_wq sq __MLX4_ALGN_DATA__;
+ int (*post_send_one)(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe, int *total_size,
+ int *inl, unsigned int ind);
+ union mlx4_bf *bf;
+ uint32_t *sdb; /* send DB */
+ struct mlx4_buf buf;
+ unsigned last_db_head;
uint32_t doorbell_qpn;
- uint32_t sq_signal_bits;
- int sq_spare_wqes;
- struct mlx4_wq sq;
-
+ uint32_t create_flags;
+ uint16_t max_inline_data;
+ uint16_t bf_buf_size;
+ uint16_t sq_spare_wqes;
+ uint8_t srcrb_flags_tbl[16];
+ uint8_t db_method;
+ uint8_t qp_type;
+ /* RAW_PACKET hot data */
+ uint8_t link_layer;
+ /* EXT_MASKED_ATOMIC hot data */
+ uint8_t is_masked_atomic;
+
+ /* post receive hot data */
+ struct mlx4_wq rq __MLX4_ALGN_DATA__;
uint32_t *db;
- struct mlx4_wq rq;
+ uint32_t max_inlr_sg;
+ int32_t cached_rx_csum_flags;
+ int32_t transposed_rx_csum_flags;
+ struct mlx4_inlr_buff inlr_buff;
+ uint8_t qp_cap_cache;
};
struct mlx4_av {
@@ -280,7 +612,6 @@
uint8_t hop_limit;
uint32_t sl_tclass_flowlabel;
uint8_t dgid[16];
- uint8_t mac[8];
};
struct mlx4_ah {
@@ -288,18 +619,20 @@
struct mlx4_av av;
uint16_t vlan;
uint8_t mac[6];
- uint8_t tagged;
};
-struct mlx4_xrc_domain {
- struct ibv_xrc_domain ibv_xrcd;
- uint32_t xrcdn;
+struct mlx4_res_domain {
+ struct ibv_exp_res_domain ibv_res_domain;
+ struct ibv_exp_res_domain_init_attr attr;
+ enum mlx4_res_domain_bf_type type;
+ struct mlx4_send_db_data *send_db;
};
static inline unsigned long align(unsigned long val, unsigned long align)
{
return (val + align - 1) & ~(align - 1);
}
+int align_queue_size(int req);
#define to_mxxx(xxx, type) \
((struct mlx4_##type *) \
@@ -307,7 +640,10 @@
static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev)
{
- return to_mxxx(dev, device);
+ /* ibv_device is first field of verbs_device
+ * see try_driver in libibverbs.
+ */
+ return container_of(ibdev, struct mlx4_device, verbs_dev);
}
static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
@@ -327,32 +663,53 @@
static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq)
{
- return to_mxxx(srq, srq);
+ return container_of(container_of(ibsrq, struct verbs_srq, srq),
+ struct mlx4_srq, verbs_srq);
}
static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
{
- return to_mxxx(qp, qp);
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
}
+static inline struct mlx4_mr *to_mmr(struct ibv_mr *ibmr)
+{
+ return to_mxxx(mr, mr);
+}
static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
{
return to_mxxx(ah, ah);
}
-#ifdef HAVE_IBV_XRC_OPS
-static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd)
+static inline struct mlx4_res_domain *to_mres_domain(struct ibv_exp_res_domain *ibres_domain)
{
- return to_mxxx(xrcd, xrc_domain);
+ return to_mxxx(res_domain, res_domain);
}
-#endif
+int update_port_data(struct ibv_qp *qp, uint8_t port_num);
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
void mlx4_free_buf(struct mlx4_buf *buf);
+int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size);
+int mlx4_alloc_buf_contig(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size, const char *component, void *req_addr);
+int mlx4_alloc_prefered_buf(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ size_t size, int page_size,
+ enum mlx4_alloc_type alloc_type,
+ const char *component);
+void mlx4_get_alloc_type(struct ibv_context *context, const char *component,
+ enum mlx4_alloc_type *alloc_type,
+ enum mlx4_alloc_type default_alloc_type);
+void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf);
+int mlx4_use_huge(struct ibv_context *context, const char *key);
uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db);
+int __mlx4_query_device(uint64_t raw_fw_ver,
+ struct ibv_device_attr *attr);
int mlx4_query_device(struct ibv_context *context,
struct ibv_device_attr *attr);
int mlx4_query_port(struct ibv_context *context, uint8_t port,
@@ -360,19 +717,42 @@
struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context);
int mlx4_free_pd(struct ibv_pd *pd);
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr);
+int mlx4_close_xrcd(struct ibv_xrcd *xrcd);
struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
- size_t length, enum ibv_access_flags access);
+ size_t length, int access);
+struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in);
+int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr,
+ struct ibv_exp_send_wr **bad_wr);
+void mlx4_update_post_send_one(struct mlx4_qp *qp);
+struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+
+struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in);
int mlx4_dereg_mr(struct ibv_mr *mr);
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type);
+int mlx4_dealloc_mw(struct ibv_mw *mw);
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind);
+
struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector);
-int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent,
int entry_size);
int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
int mlx4_destroy_cq(struct ibv_cq *cq);
-int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_poll_ibv_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries,
+ struct ibv_exp_wc *wc, uint32_t wc_size) __MLX4_ALGN_FUNC__;
int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
void mlx4_cq_event(struct ibv_cq *cq);
void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
@@ -382,76 +762,207 @@
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
struct ibv_srq_init_attr *attr);
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
- enum ibv_srq_attr_mask mask);
+ int mask);
int mlx4_query_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr);
int mlx4_destroy_srq(struct ibv_srq *srq);
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq);
int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
struct mlx4_srq *srq);
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr);
-struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
-int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
- struct mlx4_srq *srq);
-void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr);
+int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, int attr_mask);
+int mlx4_post_task(struct ibv_context *context,
+ struct ibv_exp_task *task_list,
+ struct ibv_exp_task **bad_task);
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr);
int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask,
+ int attr_mask,
struct ibv_qp_init_attr *init_attr);
int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask);
+ int attr_mask);
+int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr,
+ uint64_t attr_mask);
int mlx4_destroy_qp(struct ibv_qp *qp);
+void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n);
void mlx4_init_qp_indices(struct mlx4_qp *qp);
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp);
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
- struct ibv_send_wr **bad_wr);
+ struct ibv_send_wr **bad_wr) __MLX4_ALGN_FUNC__;
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
- struct ibv_recv_wr **bad_wr);
+ struct ibv_recv_wr **bad_wr) __MLX4_ALGN_FUNC__;
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
struct mlx4_qp *qp);
int num_inline_segs(int data, enum ibv_qp_type type);
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
- enum ibv_qp_type type, struct mlx4_qp *qp);
+void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp);
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
enum ibv_qp_type type);
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn);
int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp);
void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn);
+struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd,
+ struct ibv_ah_attr *attr,
+ uint8_t link_layer);
struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd,
+ struct ibv_exp_ah_attr *attr_ex);
int mlx4_destroy_ah(struct ibv_ah *ah);
int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
struct mlx4_ah *ah);
void mlx4_free_av(struct mlx4_ah *ah);
-#ifdef HAVE_IBV_XRC_OPS
-struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
- struct ibv_xrc_domain *xrc_domain,
- struct ibv_cq *xrc_cq,
- struct ibv_srq_init_attr *attr);
-struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
- int fd, int oflag);
-
-int mlx4_close_xrc_domain(struct ibv_xrc_domain *d);
-int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr,
- uint32_t *xrc_qp_num);
-int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask);
-int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask,
- struct ibv_qp_init_attr *init_attr);
-int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num);
-int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num);
-#endif
+struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr);
+int mlx4_query_values(struct ibv_context *context, int q_values,
+ struct ibv_exp_values *values);
+void *mlx4_get_legacy_xrc(struct ibv_srq *srq);
+void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq);
+void read_init_vars(struct mlx4_context *ctx);
+
+static inline enum mlx4_lock_type mlx4_get_locktype(void)
+{
+ if (!mlx4_use_mutex)
+ return MLX4_SPIN_LOCK;
+
+ return MLX4_MUTEX;
+}
+
+static inline int mlx4_spin_lock(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_lock(&lock->lock);
+
+ if (unlikely(lock->state == MLX4_LOCKED)) {
+ fprintf(stderr, "*** ERROR: multithreading violation ***\n"
+ "You are running a multithreaded application but\n"
+ "you set MLX4_SINGLE_THREADED=1. Please unset it.\n");
+ abort();
+ } else {
+ lock->state = MLX4_LOCKED;
+ wmb();
+ }
+
+ return 0;
+}
+
+static inline int mlx4_spin_unlock(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_unlock(&lock->lock);
+
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_lock(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_lock(&lock->slock);
+
+ return pthread_mutex_lock(&lock->mutex);
+ }
+
+ if (unlikely(lock->state == MLX4_LOCKED)) {
+ fprintf(stderr, "*** ERROR: multithreading violation ***\n"
+ "You are running a multithreaded application but\n"
+ "you set MLX4_SINGLE_THREADED=1. Please unset it.\n");
+ abort();
+ } else {
+ lock->state = MLX4_LOCKED;
+ /* Make new state visible to other threads. */
+ wmb();
+ }
+
+ return 0;
+}
+
+static inline int mlx4_unlock(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_unlock(&lock->slock);
+
+ return pthread_mutex_unlock(&lock->mutex);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+static inline int mlx4_spinlock_init(struct mlx4_spinlock *lock, int use_spinlock)
+{
+ if (use_spinlock) {
+ lock->state = MLX4_USE_LOCK;
+ return pthread_spin_init(&lock->lock, PTHREAD_PROCESS_PRIVATE);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_spinlock_destroy(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_destroy(&lock->lock);
+
+ return 0;
+}
+
+static inline int mlx4_lock_init(struct mlx4_lock *lock,
+ int use_lock,
+ enum mlx4_lock_type lock_type)
+{
+ if (use_lock) {
+ lock->type = lock_type;
+ lock->state = MLX4_USE_LOCK;
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_init(&lock->slock,
+ PTHREAD_PROCESS_PRIVATE);
+
+ return pthread_mutex_init(&lock->mutex,
+ PTHREAD_PROCESS_PRIVATE);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_lock_destroy(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_destroy(&lock->slock);
+
+ return pthread_mutex_destroy(&lock->mutex);
+ }
+
+ return 0;
+}
+
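[Editor's note] The helpers above implement the library's two locking models: real pthread locks when use_lock is set, and plain LOCKED/UNLOCKED state flips (which only detect misuse of MLX4_SINGLE_THREADED=1) otherwise. A short usage sketch built only from the functions declared above (illustration only, not part of the patch):

static int lock_usage_demo(void)
{
        struct mlx4_lock lk;

        /* thread-safe unless the user promised a single thread */
        if (mlx4_lock_init(&lk, !mlx4_single_threaded, mlx4_get_locktype()))
                return -1;                      /* pthread init failed */

        mlx4_lock(&lk);
        /* ... critical section: touch shared CQ/QP state ... */
        mlx4_unlock(&lk);

        return mlx4_lock_destroy(&lk);
}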
+static inline void mlx4_update_cons_index(struct mlx4_cq *cq)
+{
+ *cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
+}
#endif /* MLX4_H */
Index: contrib/ofed/libmlx4/src/mlx4.c
===================================================================
--- contrib/ofed/libmlx4/src/mlx4.c
+++ contrib/ofed/libmlx4/src/mlx4.c
@@ -41,18 +41,27 @@
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
-
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sched.h>
#ifndef HAVE_IBV_REGISTER_DRIVER
#include <sysfs/libsysfs.h>
#endif
+#include <sys/cpuset.h>
#include "mlx4.h"
#include "mlx4-abi.h"
+#include "mlx4_exp.h"
+
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX 0x15b3
#endif
+int mlx4_trace = 0;
+int mlx4_single_threaded = 0;
+int mlx4_use_mutex = 0;
+
#define HCA(v, d) \
{ .vendor = PCI_VENDOR_ID_##v, \
.device = d }
@@ -66,47 +75,30 @@
HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */
HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */
HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */
- HCA(MELLANOX, 0x6368), /* MT25448 [ConnectX EN 10GigE, PCIe 2.0 2.5GT/s] */
- HCA(MELLANOX, 0x6750), /* MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */
- HCA(MELLANOX, 0x6372), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 2.0 2.5GT/s] */
- HCA(MELLANOX, 0x675a), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe Gen2 5GT/s] */
- HCA(MELLANOX, 0x6764), /* MT26468 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */
- HCA(MELLANOX, 0x6746), /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
- HCA(MELLANOX, 0x676e), /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
- HCA(MELLANOX, 0x6778), /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
- HCA(MELLANOX, 0x1000),
- HCA(MELLANOX, 0x1001),
- HCA(MELLANOX, 0x1002),
- HCA(MELLANOX, 0x1003),
- HCA(MELLANOX, 0x1004),
- HCA(MELLANOX, 0x1005),
- HCA(MELLANOX, 0x1006),
- HCA(MELLANOX, 0x1007),
- HCA(MELLANOX, 0x1008),
- HCA(MELLANOX, 0x1009),
- HCA(MELLANOX, 0x100a),
- HCA(MELLANOX, 0x100b),
- HCA(MELLANOX, 0x100c),
- HCA(MELLANOX, 0x100d),
- HCA(MELLANOX, 0x100e),
- HCA(MELLANOX, 0x100f),
+ HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */
+ HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/
+ HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+ HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */
+ HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */
+ HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */
+ HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */
+ HCA(MELLANOX, 0x1005), /* MT27510 Family */
+ HCA(MELLANOX, 0x1006), /* MT27511 Family */
+ HCA(MELLANOX, 0x1007), /* MT27520 Family */
+ HCA(MELLANOX, 0x1008), /* MT27521 Family */
+ HCA(MELLANOX, 0x1009), /* MT27530 Family */
+ HCA(MELLANOX, 0x100a), /* MT27531 Family */
+ HCA(MELLANOX, 0x100b), /* MT27540 Family */
+ HCA(MELLANOX, 0x100c), /* MT27541 Family */
+ HCA(MELLANOX, 0x100d), /* MT27550 Family */
+ HCA(MELLANOX, 0x100e), /* MT27551 Family */
+ HCA(MELLANOX, 0x100f), /* MT27560 Family */
+ HCA(MELLANOX, 0x1010), /* MT27561 Family */
};
-#ifdef HAVE_IBV_MORE_OPS
-static struct ibv_more_ops mlx4_more_ops = {
-#ifdef HAVE_IBV_XRC_OPS
- .create_xrc_srq = mlx4_create_xrc_srq,
- .open_xrc_domain = mlx4_open_xrc_domain,
- .close_xrc_domain = mlx4_close_xrc_domain,
- .create_xrc_rcv_qp = mlx4_create_xrc_rcv_qp,
- .modify_xrc_rcv_qp = mlx4_modify_xrc_rcv_qp,
- .query_xrc_rcv_qp = mlx4_query_xrc_rcv_qp,
- .reg_xrc_rcv_qp = mlx4_reg_xrc_rcv_qp,
- .unreg_xrc_rcv_qp = mlx4_unreg_xrc_rcv_qp,
-#endif
-};
-#endif
-
static struct ibv_context_ops mlx4_ctx_ops = {
.query_device = mlx4_query_device,
.query_port = mlx4_query_port,
@@ -114,8 +106,11 @@
.dealloc_pd = mlx4_free_pd,
.reg_mr = mlx4_reg_mr,
.dereg_mr = mlx4_dereg_mr,
+ .alloc_mw = mlx4_alloc_mw,
+ .dealloc_mw = mlx4_dealloc_mw,
+ .bind_mw = mlx4_bind_mw,
.create_cq = mlx4_create_cq,
- .poll_cq = mlx4_poll_cq,
+ .poll_cq = mlx4_poll_ibv_cq,
.req_notify_cq = mlx4_arm_cq,
.cq_event = mlx4_cq_event,
.resize_cq = mlx4_resize_cq,
@@ -137,150 +132,592 @@
.detach_mcast = ibv_cmd_detach_mcast
};
-static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_fd)
+static int read_number_from_line(const char *line, int *value)
{
- struct mlx4_context *context;
- struct ibv_get_context cmd;
- struct mlx4_alloc_ucontext_resp resp;
- struct mlx4_alloc_ucontext_resp_v3 resp_v3;
- int i;
- struct ibv_device_attr dev_attrs;
- unsigned int bf_reg_size;
+ const char *ptr;
- context = calloc(1, sizeof *context);
- if (!context)
- return NULL;
+ ptr = strchr(line, ':');
+ if (!ptr)
+ return 1;
+
+ ++ptr;
+
+ *value = atoi(ptr);
+ return 0;
+}
+
+static int mlx4_is_sandy_bridge(int *num_cores)
+{
+ char line[128];
+ FILE *fd;
+ int rc = 0;
+ int cur_cpu_family = -1;
+ int cur_cpu_model = -1;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (!fd)
+ return 0;
+
+ *num_cores = 0;
+
+ while (fgets(line, 128, fd)) {
+ int value;
+
+ /* if this is information on new processor */
+ if (!strncmp(line, "processor", 9)) {
+ ++*num_cores;
+
+ cur_cpu_family = -1;
+ cur_cpu_model = -1;
+ } else if (!strncmp(line, "cpu family", 10)) {
+ if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
+ cur_cpu_family = value;
+ } else if (!strncmp(line, "model", 5)) {
+ if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
+ cur_cpu_model = value;
+ }
+
+ /* if this is a Sandy Bridge CPU */
+ if ((cur_cpu_family == 6) &&
+ (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
+ rc = 1;
+ }
+
+ fclose(fd);
+ return rc;
+}
+
+static void mlx4_check_numa_enabled(struct ibv_context *context)
+{
+ char fname[MAXPATHLEN];
+ char buf[128];
+ FILE *fp;
+ int numa_enabled;
+ char env[VERBS_MAX_ENV_VAL];
+
+ snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/numa_node",
+ ibv_get_device_name(context->device));
+
+ fp = fopen(fname, "r");
+ if (!fp) {
+ fprintf(stderr, PFX "Warning: can not check if NUMA is enabled "
+ "on node: failed to open %s\n", fname);
+ return;
+ }
+
+ if (!fgets(buf, sizeof(buf), fp)) {
+ fprintf(stderr, PFX "Warning: can not check if NUMA is enabled "
+ "on node: failed to read numa node value\n");
+ goto out;
+ }
+
+ numa_enabled = (strtol(buf, 0, 10) >= 0);
+ if (numa_enabled)
+ printf(PFX "Device NUMA node detection is supported\n");
+ else if (ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env, sizeof(env)))
+ printf(PFX "Warning: Device NUMA node detection is not supported. "
+ "Please consider setting the environment variable "
+ "'MLX4_LOCAL_CPUS' or enable ACPI SLIT\n");
+out:
+ fclose(fp);
+}
+
+static void dump_cpu_set(cpuset_t *cpu_set)
+{
+ int i;
+ int first_cpu = -1;
+ int last_cpu = -1;
+ int n = 0;
+
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, cpu_set)) {
+ if (first_cpu < 0)
+ first_cpu = i;
+ if (i == CPU_SETSIZE - 1)
+ last_cpu = i;
+ } else if (first_cpu >= 0)
+ last_cpu = i - 1;
+
+ if (last_cpu >= 0) {
+ if (first_cpu != last_cpu)
+ printf("%s%d-%d", n ? "," : "", first_cpu,
+ last_cpu);
+ else
+ printf("%s%d", n ? "," : "", last_cpu);
+
+ first_cpu = -1;
+ last_cpu = -1;
+ ++n;
+ }
+ }
+}
+
+/*
+man cpuset
+
+ This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
+ are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
+ words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
+ within a word are also in big-endian order.
+
+ The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
+ the size of the bitmask.
+
+ Examples of the Mask Format:
+
+ 00000001 # just bit 0 set
+ 40000000,00000000,00000000 # just bit 94 set
+ 000000ff,00000000 # bits 32-39 set
+ 00000000,000E3862 # 1,5,6,11-13,17-19 set
+
+ A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
+
+ 00000001,00000001,00010117
+
+ The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
+ bit 4, and the "7" is for bits 2, 1, and 0.
+*/
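+/*
+ * Build the device-local CPU set from the MLX4_LOCAL_CPUS environment
+ * variable or from sysfs local_cpus, parsing the mask format described
+ * above: words are taken from the rightmost comma backwards, each word
+ * covering the next 32 CPU ids.
+ */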
+static void mlx4_local_cpu_set(struct ibv_context *context, cpuset_t *cpu_set)
+{
+ char *p, buf[1024];
+ char env_value[VERBS_MAX_ENV_VAL];
+ uint32_t word;
+ int i, k;
+
+ if (mlx4_trace)
+ mlx4_check_numa_enabled(context);
+
+ if (!ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env_value, sizeof(env_value))) {
+ strncpy(buf, env_value, sizeof(buf));
+ if (mlx4_trace)
+			printf(PFX "Local CPUs flags were overridden by %s\n", buf);
+ } else {
+ char fname[MAXPATHLEN];
+ FILE *fp;
+
+ snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus",
+ ibv_get_device_name(context->device));
+
+ fp = fopen(fname, "r");
+ if (!fp) {
+ fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
+ return;
+ }
+ if (!fgets(buf, sizeof(buf), fp)) {
+ fprintf(stderr, PFX "Warning: can not get local cpu set: failed to read cpu mask\n");
+ fclose(fp);
+ return;
+ }
+ fclose(fp);
+ }
- context->ibv_ctx.cmd_fd = cmd_fd;
+ p = strrchr(buf, ',');
+ if (!p)
+ p = buf;
- if (to_mdev(ibdev)->driver_abi_ver > 3) {
- if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp))
- goto err_free;
+ i = 0;
+ do {
+ if (*p == ',') {
+ *p = 0;
+			p++;
+ }
+
+ word = strtoul(p, 0, 16);
+
+ for (k = 0; word; ++k, word >>= 1)
+ if (word & 1)
+ CPU_SET(k+i, cpu_set);
+
+ if (p == buf)
+ break;
+
+ p = strrchr(buf, ',');
+ if (!p)
+ p = buf;
+
+ i += 32;
+ } while (i < CPU_SETSIZE);
+}
+
+static int mlx4_enable_sandy_bridge_fix(struct ibv_context *context)
+{
+ cpuset_t my_cpus, dev_local_cpus, result_set;
+ int stall_enable;
+ int ret;
+ int num_cores;
+
+ if (!mlx4_is_sandy_bridge(&num_cores))
+ return 0;
+
+ /* by default disable stall on sandy bridge arch */
+ stall_enable = 0;
+
+ /*
+ * check if app is bound to cpu set that is inside
+ * of device local cpu set. Disable stalling if true
+ */
+
+ /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
+ CPU_ZERO(&my_cpus);
+ CPU_ZERO(&dev_local_cpus);
+ CPU_ZERO(&result_set);
+ ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
+ sizeof(my_cpus), &my_cpus);
+ if (ret == -1) {
+ if (errno == EINVAL)
+ fprintf(stderr, PFX "Warning: my cpu set is too small\n");
+ else
+ fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
+ goto out;
+ }
+
+ if (mlx4_trace) {
+ printf(PFX "Running on cpus: ");
+ dump_cpu_set(&my_cpus);
+ printf("\n");
+ }
+
+ /* get device local cpu set */
+ mlx4_local_cpu_set(context, &dev_local_cpus);
+
+	/* make sure result_set is not initialized to all 0 */
+ CPU_SET(0, &result_set);
+ /* Set stall_enable if my cpu set and dev cpu set are disjoint sets */
+ CPU_AND(&result_set, &my_cpus);
+ CPU_AND(&result_set, &dev_local_cpus);
+ stall_enable = CPU_COUNT(&result_set) ? 0 : 1;
+
+ if (mlx4_trace) {
+ printf(PFX "HCA:%s local cpus: ", ibv_get_device_name(context->device));
+ dump_cpu_set(&dev_local_cpus);
+ printf("\n");
+ if (CPU_COUNT(&my_cpus) == num_cores) {
+ printf(PFX "Warning: CPU affinity wasn't used for this "
+			       "process. If the system has more than one NUMA node, it might be using a remote one.\n");
+			printf(PFX "         To achieve better performance, "
+ "please consider setting the CPU "
+ "affinity.\n");
+ }
+ }
+
+out:
+ if (mlx4_trace)
+ printf(PFX "Sandy Bridge CPU was detected, cq_stall is %s\n",
+ stall_enable ? "enabled" : "disabled");
+
+ return stall_enable;
+}
+
+static void mlx4_read_env(struct ibv_device *ibdev, struct mlx4_context *ctx)
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_TRACE", env_value, sizeof(env_value)) &&
+ (strcmp(env_value, "0")))
+ mlx4_trace = 1;
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_CQ_POLL", env_value, sizeof(env_value)) &&
+ !strcmp(env_value, "0"))
+			/* check if cq stall is overridden by the user */
+ ctx->stall_enable = 0;
+ else
+ /* autodetect if we need to do cq polling */
+ ctx->stall_enable = mlx4_enable_sandy_bridge_fix(&ctx->ibv_ctx);
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_NUM_LOOP", env_value, sizeof(env_value)))
+ mlx4_stall_num_loop = atoi(env_value);
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_SINGLE_THREADED", env_value, sizeof(env_value)))
+ mlx4_single_threaded = strcmp(env_value, "1") ? 0 : 1;
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx,
+ "MLX4_USE_MUTEX",
+ env_value,
+ sizeof(env_value)))
+ mlx4_use_mutex = strcmp(env_value, "1") ? 0 : 1;
+}
+
+void read_init_vars(struct mlx4_context *ctx)
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+
+ pthread_mutex_lock(&ctx->env_mtx);
+ if (!ctx->env_initialized) {
+ mlx4_read_env(ctx->ibv_ctx.device, ctx);
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_POST_SEND_PREFER_BF", env_value, sizeof(env_value))) {
+ ctx->prefer_bf = !!strcmp(env_value, "0");
+ if (mlx4_trace)
+ printf(PFX "prefer_bf=%d\n", ctx->prefer_bf);
+ } else {
+ ctx->prefer_bf = 1;
+ }
- context->num_qps = resp.qp_tab_size;
- context->num_xrc_srqs = resp.qp_tab_size;
- bf_reg_size = resp.bf_reg_size;
- context->cqe_size = resp.cqe_size;
+ ctx->env_initialized = 1;
+ }
+ pthread_mutex_unlock(&ctx->env_mtx);
+}
+
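+/*
+ * Provider hook that replaces the old alloc_context/free_context pair: the
+ * verbs layer allocates the context itself (sized via size_of_context, set
+ * below in mlx4_driver_init()) and calls this to initialize it over cmd_fd.
+ */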
+static int mlx4_init_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx, int cmd_fd)
+{
+ struct mlx4_context *context;
+ struct mlx4_alloc_ucontext_req req;
+ struct mlx4_alloc_ucontext_resp resp;
+ struct mlx4_alloc_ucontext_resp_v3 resp_v3;
+ int i;
+ struct ibv_exp_device_attr dev_attrs;
+ struct ibv_device_attr dev_legacy_attrs;
+ struct mlx4_device *dev = to_mdev(&v_device->device);
+ unsigned int qp_tab_size;
+ unsigned int bf_reg_size;
+ unsigned int cqe_size;
+ int hca_clock_offset;
+ void *hca_clock_page = NULL;
+
+ /* verbs_context should be used for new verbs.
+	 * The memory footprints of mlx4_context and verbs_context share
+ * struct ibv_context.
+ */
+ struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
+ struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ibv_ctx);
+
+ memset(&req, 0, sizeof(req));
+ context = to_mctx(ibv_ctx);
+ ibv_ctx->cmd_fd = cmd_fd;
+ ibv_ctx->device = &v_device->device;
+
+ if (pthread_mutex_init(&context->env_mtx, NULL))
+ return EIO;
+
+ if (dev->driver_abi_ver > 3) {
+#ifdef MLX4_WQE_FORMAT
+ req.lib_caps = MLX4_USER_DEV_CAP_WQE_FORMAT;
+#endif
+ if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req),
+ &resp.ibv_resp, sizeof(resp)))
+ return errno;
+
+ VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp));
+ qp_tab_size = resp.qp_tab_size;
+ bf_reg_size = resp.bf_reg_size;
+ context->bf_regs_per_page = resp.bf_regs_per_page;
+ cqe_size = resp.cqe_size;
} else {
- if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
- &resp_v3.ibv_resp, sizeof resp_v3))
- goto err_free;
-
- context->num_qps = resp_v3.qp_tab_size;
- context->num_xrc_srqs = resp_v3.qp_tab_size;
- bf_reg_size = resp_v3.bf_reg_size;
- context->cqe_size = 32;
+ if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req.cmd),
+ &resp_v3.ibv_resp, sizeof(resp_v3)))
+ return errno;
+
+ VALGRIND_MAKE_MEM_DEFINED(&resp_v3, sizeof(resp_v3));
+ qp_tab_size = resp_v3.qp_tab_size;
+ bf_reg_size = resp_v3.bf_reg_size;
+ context->bf_regs_per_page = resp_v3.bf_regs_per_page;
+ cqe_size = 32;
}
+ context->num_qps = qp_tab_size;
context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS;
context->qp_table_mask = (1 << context->qp_table_shift) - 1;
+ context->cqe_size = cqe_size;
+ for (i = 0; i < MLX4_PORTS_NUM; ++i)
+ context->port_query_cache[i].valid = 0;
pthread_mutex_init(&context->qp_table_mutex, NULL);
for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
context->qp_table[i].refcnt = 0;
- context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1
- - MLX4_XRC_SRQ_TABLE_BITS;
- context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1;
-
- pthread_mutex_init(&context->xrc_srq_table_mutex, NULL);
- for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i)
- context->xrc_srq_table[i].refcnt = 0;
-
for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
context->db_list[i] = NULL;
+ mlx4_init_xsrq_table(&context->xsrq_table, qp_tab_size);
pthread_mutex_init(&context->db_list_mutex, NULL);
- context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE,
+ context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
MAP_SHARED, cmd_fd, 0);
if (context->uar == MAP_FAILED)
- goto err_free;
+ return errno;
if (bf_reg_size) {
- context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size,
- PROT_WRITE, MAP_SHARED, cmd_fd,
- to_mdev(ibdev)->page_size);
- if (context->bf_page == MAP_FAILED) {
+ context->bfs.page = mmap(NULL, dev->page_size,
+ PROT_WRITE, MAP_SHARED, cmd_fd,
+ dev->page_size);
+ if (context->bfs.page == MAP_FAILED) {
fprintf(stderr, PFX "Warning: BlueFlame available, "
"but failed to mmap() BlueFlame page.\n");
- context->bf_page = NULL;
- context->bf_buf_size = 0;
+ context->bfs.page = NULL;
+ context->bfs.buf_size = 0;
+ context->bfs.num_dedic_bfs = 0;
} else {
- context->bf_buf_size = bf_reg_size / 2;
- context->bf_offset = 0;
- pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
+ context->bfs.num_dedic_bfs = min(context->bf_regs_per_page - 1,
+ MLX4_MAX_BFS_IN_PAGE - 1);
+ context->bfs.buf_size = bf_reg_size / 2;
+ mlx4_spinlock_init(&context->bfs.dedic_bf_lock, !mlx4_single_threaded);
+ context->bfs.cmn_bf.address = context->bfs.page;
+
+ mlx4_lock_init(&context->bfs.cmn_bf.lock,
+ !mlx4_single_threaded,
+ mlx4_get_locktype());
+
+ context->bfs.dedic_bf_free = context->bfs.num_dedic_bfs;
+ for (i = 0; i < context->bfs.num_dedic_bfs; i++) {
+ context->bfs.dedic_bf[i].address = context->bfs.page + (i + 1) * MLX4_BFS_STRIDE;
+ context->bfs.dedic_bf_used[i] = 0;
+ }
}
} else {
- context->bf_page = NULL;
- context->bf_buf_size = 0;
+ context->bfs.page = NULL;
+ context->bfs.buf_size = 0;
+ context->bfs.num_dedic_bfs = 0;
}
- pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
+ mlx4_spinlock_init(&context->uar_lock, !mlx4_single_threaded);
- context->ibv_ctx.ops = mlx4_ctx_ops;
-#ifdef HAVE_IBV_XRC_OPS
- context->ibv_ctx.more_ops = &mlx4_more_ops;
-#endif
+ mlx4_spinlock_init(&context->send_db_lock, !mlx4_single_threaded);
+ INIT_LIST_HEAD(&context->send_db_list);
+
+ mlx4_spinlock_init(&context->hugetlb_lock, !mlx4_single_threaded);
+ INIT_LIST_HEAD(&context->hugetlb_list);
- if (mlx4_query_device(&context->ibv_ctx, &dev_attrs))
- goto query_free;
+ pthread_mutex_init(&context->task_mutex, NULL);
+
+ memset(&dev_attrs, 0, sizeof(dev_attrs));
+ dev_attrs.comp_mask = IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK |
+ IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK |
+ IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
+ IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN;
+
+ if (mlx4_exp_query_device(ibv_ctx, &dev_attrs)) {
+ if (mlx4_query_device(ibv_ctx, &dev_legacy_attrs))
+ goto query_free;
+
+ memcpy(&dev_attrs, &dev_legacy_attrs, sizeof(dev_legacy_attrs));
+ }
context->max_qp_wr = dev_attrs.max_qp_wr;
context->max_sge = dev_attrs.max_sge;
context->max_cqe = dev_attrs.max_cqe;
- if (!(dev_attrs.device_cap_flags & IBV_DEVICE_XRC)) {
- fprintf(stderr, PFX "There is a mismatch between "
- "the kernel and the userspace libraries: "
- "Kernel does not support XRC. Exiting.\n");
- goto query_free;
+ context->exp_device_cap_flags = dev_attrs.exp_device_cap_flags;
+ if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN)
+ context->max_ctx_res_domain = dev_attrs.max_ctx_res_domain;
+
+ VALGRIND_MAKE_MEM_DEFINED(&context->hca_core_clock, sizeof(context->hca_core_clock));
+ if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) {
+ if (dev_attrs.hca_core_clock)
+ context->core_clk.mult = ((1ull * 1000) << 29) /
+ dev_attrs.hca_core_clock;
+ else
+ context->core_clk.mult = 0;
+
+ context->core_clk.shift = 29;
+ context->core_clk.mask = dev_attrs.timestamp_mask;
+
+ if (ioctl(cmd_fd, MLX4_IOCHWCLOCKOFFSET,
+ &hca_clock_offset) >= 0) {
+ VALGRIND_MAKE_MEM_DEFINED(&hca_clock_offset, sizeof(hca_clock_offset));
+ context->core_clk.offset = hca_clock_offset;
+ hca_clock_page = mmap(NULL, hca_clock_offset +
+ sizeof(context->core_clk.mask),
+ PROT_READ, MAP_SHARED, cmd_fd,
+ dev->page_size *
+ (MLX4_IB_MMAP_GET_HW_CLOCK));
+
+ if (hca_clock_page == MAP_FAILED) {
+ fprintf(stderr, PFX
+					"Warning: Timestamp available, "
+ "but failed to mmap() hca core "
+ "clock page.\n");
+ } else {
+ context->hca_core_clock = hca_clock_page +
+ context->core_clk.offset;
+ }
+ }
}
- return &context->ibv_ctx;
+ ibv_ctx->ops = mlx4_ctx_ops;
+
+ verbs_ctx->has_comp_mask |= VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
+ VERBS_CONTEXT_QP;
+
+ verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
+ verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
+ verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
+ verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
+ verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
+ verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
+ verbs_set_ctx_op(verbs_ctx, create_flow, ibv_cmd_create_flow);
+ verbs_set_ctx_op(verbs_ctx, destroy_flow, ibv_cmd_destroy_flow);
+
+ /*
+ * Set experimental verbs
+ */
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_reg_shared_mr, mlx4_reg_shared_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow, ibv_exp_cmd_create_flow);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow, ibv_exp_cmd_destroy_flow);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah, mlx4_exp_create_ah);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device, mlx4_exp_query_device);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp, mlx4_exp_create_qp);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp, mlx4_exp_modify_qp);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port, mlx4_exp_query_port);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx4_modify_cq);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx4_post_task);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc, mlx4_set_legacy_xrc);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc, mlx4_get_legacy_xrc);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq, mlx4_exp_poll_cq);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx4_create_cq_ex);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values, mlx4_query_values);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx4_exp_reg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send, mlx4_exp_post_send);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_bind_mw, mlx4_exp_bind_mw);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_rereg_mr, mlx4_exp_rereg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx4_exp_dereg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain, mlx4_exp_create_res_domain);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain, mlx4_exp_destroy_res_domain);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx4_exp_query_intf);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf, mlx4_exp_release_intf);
+
+ return 0;
query_free:
- munmap(context->uar, to_mdev(ibdev)->page_size);
- if (context->bf_page)
- munmap(context->bf_page, to_mdev(ibdev)->page_size);
-
-err_free:
- free(context);
- return NULL;
+ munmap(context->uar, dev->page_size);
+ if (context->bfs.page)
+ munmap(context->bfs.page, dev->page_size);
+ if (hca_clock_page)
+ munmap(hca_clock_page, hca_clock_offset +
+ sizeof(context->core_clk.mask));
+
+ return errno;
}
-static void mlx4_free_context(struct ibv_context *ibctx)
+static void mlx4_uninit_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx)
{
- struct mlx4_context *context = to_mctx(ibctx);
-
- munmap(context->uar, to_mdev(ibctx->device)->page_size);
- if (context->bf_page)
- munmap(context->bf_page, to_mdev(ibctx->device)->page_size);
- free(context);
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+
+ munmap(context->uar, to_mdev(&v_device->device)->page_size);
+ if (context->bfs.page)
+ munmap(context->bfs.page,
+ to_mdev(&v_device->device)->page_size);
+ if (context->hca_core_clock)
+ munmap((context->hca_core_clock - context->core_clk.offset),
+ context->core_clk.offset + sizeof(context->core_clk.mask));
}
-static struct ibv_device_ops mlx4_dev_ops = {
- .alloc_context = mlx4_alloc_context,
- .free_context = mlx4_free_context
-};
-
-static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path,
- int abi_version)
+static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path,
+ int abi_version)
{
char value[8];
- struct mlx4_device *dev;
+ struct mlx4_device *dev;
unsigned vendor, device;
int i;
if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
value, sizeof value) < 0)
return NULL;
- sscanf(value, "%i", &vendor);
+ vendor = strtol(value, NULL, 16);
if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
value, sizeof value) < 0)
return NULL;
- sscanf(value, "%i", &device);
+ device = strtol(value, NULL, 16);
for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
if (vendor == hca_table[i].vendor &&
@@ -300,24 +737,32 @@
return NULL;
}
- dev = malloc(sizeof *dev);
+ dev = calloc(1, sizeof(*dev));
if (!dev) {
fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
uverbs_sys_path);
return NULL;
}
- dev->ibv_dev.ops = mlx4_dev_ops;
dev->page_size = sysconf(_SC_PAGESIZE);
+
+ dev->devid.id = device;
dev->driver_abi_ver = abi_version;
- return &dev->ibv_dev;
+ dev->verbs_dev.sz = sizeof(*dev);
+ dev->verbs_dev.size_of_context =
+ sizeof(struct mlx4_context) - sizeof(struct ibv_context);
+ /* mlx4_init_context will initialize provider calls */
+ dev->verbs_dev.init_context = mlx4_init_context;
+ dev->verbs_dev.uninit_context = mlx4_uninit_context;
+
+ return &dev->verbs_dev;
}
#ifdef HAVE_IBV_REGISTER_DRIVER
static __attribute__((constructor)) void mlx4_register_driver(void)
{
- ibv_register_driver("mlx4", mlx4_driver_init);
+ verbs_register_driver("mlx4", mlx4_driver_init);
}
#else
/*
Index: contrib/ofed/libmlx4/src/mlx4_exp.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/mlx4_exp.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_EXP_H
+#define MLX4_EXP_H
+
+#include <infiniband/kern-abi_exp.h>
+#include "mlx4.h"
+
+/*
+ * mlx4-abi experimental structs
+ */
+struct mlx4_exp_create_qp {
+ struct ibv_exp_create_qp ibv_cmd;
+ struct mlx4_exp_create_qp_provider exp_cmd;
+};
+
+struct mlx4_exp_create_cq {
+ struct ibv_exp_create_cq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+/*
+ * Experimental functions
+ */
+struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr);
+int mlx4_exp_query_device(struct ibv_context *context,
+ struct ibv_exp_device_attr *attr);
+int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num,
+ struct ibv_exp_port_attr *port_attr);
+int mlx4_exp_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr,
+ int attr_mask);
+int mlx4_exp_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd,
+ void *addr, size_t length, uint64_t access,
+ struct ibv_exp_rereg_mr_attr *attr, struct ibv_exp_rereg_out *out);
+int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out);
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain_init_attr *attr);
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain *res_dom,
+ struct ibv_exp_destroy_res_domain_attr *attr);
+void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+ struct ibv_exp_release_intf_params *params);
+
+#endif /* MLX4_EXP_H */
Index: contrib/ofed/libmlx4/src/qp.c
===================================================================
--- contrib/ofed/libmlx4/src/qp.c
+++ contrib/ofed/libmlx4/src/qp.c
@@ -40,11 +40,40 @@
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>
+#include <errno.h>
#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
+#ifndef htobe64
+#include <endian.h>
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+# define htobe64(x) __bswap_64 (x)
+# else
+# define htobe64(x) (x)
+# endif
+#endif
+
+#ifdef MLX4_WQE_FORMAT
+ #define SET_BYTE_COUNT(byte_count) (htonl(byte_count) | owner_bit)
+ #define WQE_CTRL_OWN (1 << 30)
+#else
+ #define SET_BYTE_COUNT(byte_count) htonl(byte_count)
+ #define WQE_CTRL_OWN (1 << 31)
+#endif
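+/*
+ * In wqe-format mode the ownership indication moves from bit 31 to bit 30 of
+ * the ctrl segment and is also OR-ed into every byte_count via
+ * SET_BYTE_COUNT() (which expects an 'owner_bit' variable in the caller's
+ * scope), so the send queue no longer needs to be stamped.
+ */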
+enum {
+ MLX4_OPCODE_BASIC = 0x00010000,
+ MLX4_OPCODE_MANAGED = 0x00020000,
+
+ MLX4_OPCODE_WITH_IMM = 0x01000000
+};
+
+#define MLX4_IB_OPCODE(op, class, attr) (((class) & 0x00FF0000) | ((attr) & 0xFF000000) | ((op) & 0x0000FFFF))
+#define MLX4_IB_OPCODE_GET_CLASS(opcode) ((opcode) & 0x00FF0000)
+#define MLX4_IB_OPCODE_GET_OP(opcode) ((opcode) & 0x0000FFFF)
+#define MLX4_IB_OPCODE_GET_ATTR(opcode) ((opcode) & 0xFF000000)
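+/*
+ * MLX4_IB_OPCODE() packs three fields into one 32-bit value: bits 0-15 carry
+ * the hardware opcode, bits 16-23 the opcode class (basic or
+ * managed/cross-channel) and bits 24-31 attribute flags such as
+ * MLX4_OPCODE_WITH_IMM.
+ */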
+
static const uint32_t mlx4_ib_opcode[] = {
[IBV_WR_SEND] = MLX4_OPCODE_SEND,
[IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
@@ -55,14 +84,151 @@
[IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
};
-static void *get_recv_wqe(struct mlx4_qp *qp, int n)
+
+static const uint32_t mlx4_ib_opcode_exp[] = {
+ [IBV_EXP_WR_SEND] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_RDMA_WRITE] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_RDMA_WRITE_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_RDMA_READ] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_READ, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_CS, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_FA, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_CS, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_FA, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_LOCAL_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_LOCAL_INVAL, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_WITH_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_INVAL, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_BIND_MW] = MLX4_IB_OPCODE(MLX4_OPCODE_BIND_MW, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_ENABLE, MLX4_OPCODE_MANAGED, 0),
+ [IBV_EXP_WR_RECV_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_RECV_ENABLE, MLX4_OPCODE_MANAGED, 0),
+ [IBV_EXP_WR_CQE_WAIT] = MLX4_IB_OPCODE(MLX4_OPCODE_CQE_WAIT, MLX4_OPCODE_MANAGED, 0),
+};
+
+enum {
+ MLX4_CALC_FLOAT64_ADD = 0x00,
+ MLX4_CALC_UINT64_ADD = 0x01,
+ MLX4_CALC_UINT64_MAXLOC = 0x02,
+ MLX4_CALC_UINT64_AND = 0x03,
+ MLX4_CALC_UINT64_XOR = 0x04,
+ MLX4_CALC_UINT64_OR = 0x05
+};
+
+enum {
+ MLX4_WQE_CTRL_CALC_OP = 26
+};
+
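+/*
+ * Cross-channel CALC operations, indexed by [data size][operation][data type].
+ * Only entries with .valid set are supported; .opcode already holds the value
+ * shifted into place for the WQE ctrl segment (MLX4_WQE_CTRL_CALC_OP, bit 26).
+ */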
+static const struct mlx4_calc_op {
+ int valid;
+ uint32_t opcode;
+} mlx4_calc_ops_table
+ [IBV_EXP_CALC_DATA_SIZE_NUMBER]
+ [IBV_EXP_CALC_OP_NUMBER]
+ [IBV_EXP_CALC_DATA_TYPE_NUMBER] = {
+ [IBV_EXP_CALC_DATA_SIZE_64_BIT] = {
+ [IBV_EXP_CALC_OP_ADD] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_FLOAT64_ADD << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BXOR] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BAND] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BOR] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_MAXLOC] = {
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_MAXLOC << MLX4_WQE_CTRL_CALC_OP }
+ }
+ }
+};
+
+static int post_send_other(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_rc_raw_packet(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_ud(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_rc_uc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_xrc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+
+#define MLX4_WAIT_EN_VALID (1<<30)
+
+static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) __attribute__((always_inline));
+static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count)
{
- return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
+ struct mlx4_wqe_wait_en_seg *seg = (struct mlx4_wqe_wait_en_seg *)wqe_seg;
+
+ seg->valid = htonl(MLX4_WAIT_EN_VALID);
+ seg->pi = htonl(count);
+ seg->obj_num = htonl(obj_num);
+
+ return;
}
-static void *get_send_wqe(struct mlx4_qp *qp, int n)
+static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) __attribute__((always_inline));
+static inline void *get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->rq.buf + (n << qp->rq.wqe_shift);
+}
+
+void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return get_recv_wqe(qp, n);
+}
+
+static void *get_send_wqe64(struct mlx4_qp *qp, unsigned int n)
+{
+ return qp->sq.buf + (n << 6);
+}
+static void *get_send_wqe(struct mlx4_qp *qp, unsigned int n)
{
- return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
+ return qp->sq.buf + (n << qp->sq.wqe_shift);
}
/*
@@ -70,7 +236,48 @@
* first four bytes of every 64 byte chunk with 0xffffffff, except for
* the very first chunk of the WQE.
*/
-static void stamp_send_wqe(struct mlx4_qp *qp, int n)
+void mlx4_init_qp_indices(struct mlx4_qp *qp)
+{
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head_en_index = 0;
+ qp->sq.head_en_count = 0;
+ qp->rq.head_en_index = 0;
+ qp->rq.head_en_count = 0;
+}
+
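+/*
+ * wqe-format replacement for the stamping scheme below: pre-mark the first
+ * dword of every 64-byte chunk of the send queue as HW-owned, and let
+ * set_owner_wqe() refresh the unused tail of each WQE after posting.
+ */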
+#ifdef MLX4_WQE_FORMAT
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
+{
+ __be32 *wqe = get_send_wqe(qp, 0);
+ int wq_size = (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ int i;
+
+ for (i = 0; i < wq_size; i += 64)
+ wqe[i / 4] = htonl(WQE_CTRL_OWN);
+}
+
+static void set_owner_wqe(struct mlx4_qp *qp, unsigned int idx, int ds,
+ uint32_t owner_bit)
+{
+ uint32_t *wqe;
+ int max_sz = (1 << qp->sq.wqe_shift) / 4;
+ int cur_sz = ds * 4;
+ int tail_sz;
+ int i;
+
+ if (max_sz - cur_sz < 16)
+ return;
+
+ wqe = get_send_wqe(qp, idx & (qp->sq.wqe_cnt - 1));
+ tail_sz = max_sz - cur_sz;
+ for (i = 0; tail_sz > 16; i += 4, tail_sz -= 16)
+ wqe[cur_sz + i * 4] = owner_bit;
+}
+#else
+static void stamp_send_wqe(struct mlx4_qp *qp, unsigned int n)
{
uint32_t *wqe = get_send_wqe(qp, n);
int i;
@@ -80,14 +287,6 @@
wqe[i] = 0xffffffff;
}
-void mlx4_init_qp_indices(struct mlx4_qp *qp)
-{
- qp->sq.head = 0;
- qp->sq.tail = 0;
- qp->rq.head = 0;
- qp->rq.tail = 0;
-}
-
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
struct mlx4_wqe_ctrl_seg *ctrl;
@@ -95,29 +294,78 @@
for (i = 0; i < qp->sq.wqe_cnt; ++i) {
ctrl = get_send_wqe(qp, i);
- ctrl->owner_opcode = htonl(1 << 31);
+ ctrl->owner_opcode = htonl(WQE_CTRL_OWN);
ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
stamp_send_wqe(qp, i);
}
}
+#endif
-static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
+static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((noinline));
+static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp)
{
+ struct mlx4_cq *cq = to_mcq(qp->verbs_qp.qp.send_cq);
unsigned cur;
+ mlx4_lock(&cq->lock);
cur = wq->head - wq->tail;
- if (cur + nreq < wq->max_post)
- return 0;
+ mlx4_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((always_inline));
+static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp)
+{
+ unsigned cur;
- pthread_spin_lock(&cq->lock);
cur = wq->head - wq->tail;
- pthread_spin_unlock(&cq->lock);
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
- return cur + nreq >= wq->max_post;
+ return __wq_overflow(wq, nreq, qp);
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_exp_send_wr *wr)
+{
+ uint64_t acc = wr->bind_mw.bind_info.exp_mw_access_flags;
+ bseg->flags1 = 0;
+ if (acc & IBV_EXP_ACCESS_REMOTE_ATOMIC)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_ATOMIC);
+ if (acc & IBV_EXP_ACCESS_REMOTE_WRITE)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_WRITE);
+ if (acc & IBV_EXP_ACCESS_REMOTE_READ)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_READ);
+
+ bseg->flags2 = 0;
+ if (((struct verbs_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
+ bseg->flags2 |= htonl(MLX4_WQE_BIND_TYPE_2);
+ if (acc & IBV_EXP_ACCESS_MW_ZERO_BASED)
+ bseg->flags2 |= htonl(MLX4_WQE_BIND_ZERO_BASED);
+
+ bseg->new_rkey = htonl(wr->bind_mw.rkey);
+ bseg->lkey = htonl(wr->bind_mw.bind_info.mr->lkey);
+ bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
+ bseg->length = htobe64(wr->bind_mw.bind_info.length);
+}
+
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey) __attribute__((always_inline));
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey)
+{
+ iseg->mem_key = htonl(rkey);
+
+ iseg->reserved1 = 0;
+ iseg->reserved2 = 0;
+ iseg->reserved3[0] = 0;
+ iseg->reserved3[1] = 0;
}
static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ uint64_t remote_addr, uint32_t rkey) __attribute__((always_inline));
+static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
uint64_t remote_addr, uint32_t rkey)
{
rseg->raddr = htonll(remote_addr);
@@ -125,16 +373,33 @@
rseg->reserved = 0;
}
-static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
+ struct ibv_exp_send_wr *wr)
{
- if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
+ struct ibv_exp_fetch_add *fa;
+
+ if (wr->exp_opcode == IBV_EXP_WR_ATOMIC_CMP_AND_SWP) {
aseg->swap_add = htonll(wr->wr.atomic.swap);
aseg->compare = htonll(wr->wr.atomic.compare_add);
+ } else if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) {
+ fa = &wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add;
+ aseg->swap_add = htonll(fa->add_val);
+ aseg->compare = htonll(fa->field_boundary);
} else {
aseg->swap_add = htonll(wr->wr.atomic.compare_add);
aseg->compare = 0;
}
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+ struct ibv_exp_send_wr *wr)
+{
+ struct ibv_exp_cmp_swap *cs = &wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap;
+ aseg->swap_data = htonll(cs->swap_val);
+ aseg->cmp_data = htonll(cs->compare_val);
+ aseg->swap_mask = htonll(cs->swap_mask);
+ aseg->cmp_mask = htonll(cs->compare_mask);
}
static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
@@ -147,14 +412,18 @@
memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}
-static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) __attribute__((always_inline));
+static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
dseg->byte_count = htonl(sg->length);
dseg->lkey = htonl(sg->lkey);
dseg->addr = htonll(sg->addr);
}
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg,
+ struct ibv_sge *sg, unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg,
+ struct ibv_sge *sg, unsigned int owner_bit)
{
dseg->lkey = htonl(sg->lkey);
dseg->addr = htonll(sg->addr);
@@ -169,7 +438,10 @@
*/
wmb();
- dseg->byte_count = htonl(sg->length);
+ if (likely(sg->length))
+ dseg->byte_count = SET_BYTE_COUNT(sg->length);
+ else
+ dseg->byte_count = htonl(0x80000000);
}
/*
@@ -177,84 +449,787 @@
* implementations may use move-string-buffer assembler instructions,
* which do not guarantee order of copying.
*/
-static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+#if defined(__amd64__)
+#define COPY_64B_WC(dst, src) \
+ __asm__ __volatile__ ( \
+ " movdqa (%1),%%xmm0\n" \
+ " movdqa 16(%1),%%xmm1\n" \
+ " movdqa 32(%1),%%xmm2\n" \
+ " movdqa 48(%1),%%xmm3\n" \
+ " movntdq %%xmm0, (%0)\n" \
+ " movntdq %%xmm1, 16(%0)\n" \
+ " movntdq %%xmm2, 32(%0)\n" \
+ " movntdq %%xmm3, 48(%0)\n" \
+ : : "r" (dst), "r" (src) : "memory"); \
+ dst += 8; \
+ src += 8
+#else
+#define COPY_64B_WC(dst, src) \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++
+#endif
+
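+/*
+ * On amd64 COPY_64B_WC uses SSE2 non-temporal stores (movntdq) so each
+ * 64-byte chunk goes straight through the write-combining BlueFlame
+ * mapping; other architectures fall back to plain 64-bit copies.
+ */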
+static void mlx4_bf_copy(uint64_t *dst, uint64_t *src, unsigned bytecnt)
{
while (bytecnt > 0) {
- *dst++ = *src++;
- *dst++ = *src++;
- bytecnt -= 2 * sizeof (long);
+ COPY_64B_WC(dst, src);
+ bytecnt -= 8 * sizeof(uint64_t);
+ }
+}
+
+/* Convert WQE format to fit BF usage */
+static inline void convert_to_bf_wqe(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const unsigned wqe_idx) __attribute__((always_inline));
+static inline void convert_to_bf_wqe(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const unsigned wqe_idx)
+{
+ uint32_t *tmp = (uint32_t *)ctrl->reserved;
+
+ ctrl->owner_opcode |= htonl((wqe_idx & 0xffff) << 8);
+ *tmp |= qp->doorbell_qpn;
+}
+
+static inline void copy_wqe_to_bf(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const int aligned_size,
+ const unsigned wqe_idx,
+ const int dedic_bf,
+ const int one_thread_auto_evict) __attribute__((always_inline));
+static inline void copy_wqe_to_bf(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const int aligned_size,
+ const unsigned wqe_idx,
+ const int dedic_bf,
+ const int one_thread_auto_evict)
+{
+ convert_to_bf_wqe(qp, ctrl, wqe_idx);
+
+ if (dedic_bf && one_thread_auto_evict)
+ /*
+		 * If the QP has a dedicated BF, only one thread is using this QP
+		 * and the CPU arch supports auto eviction of the WC buffer, we can
+		 * move the wc_wmb before the bf_copy (usually it is located after
+		 * the bf_copy). This provides a significant improvement in the
+		 * message rate of small messages.
+ * This barrier keeps BF toggling order by ensuring that previous BF data
+ * is written to memory before writing to the next BF buffer.
+ */
+ wc_wmb();
+ else
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ if (dedic_bf) {
+ mlx4_bf_copy(qp->bf->dedic.address, (uint64_t *) ctrl, aligned_size);
+ } else {
+ mlx4_lock(&qp->bf->cmn.lock);
+ mlx4_bf_copy(qp->bf->cmn.address, (uint64_t *) ctrl, aligned_size);
+ }
+ if (!(dedic_bf && one_thread_auto_evict))
+ /*
+ * This barrier ensures that BF data is written to memory
+ * before toggling the BF buffer. This is to keep the right
+		 * toggling order and to prevent the case in which the next BF data
+		 * is written before the current BF data.
+		 * In addition, this barrier ensures the eviction of the WC buffer.
+ * See comment above for the conditions in which this barrier may be
+ * set before the bf_copy.
+ */
+ wc_wmb();
+
+ if (dedic_bf) {
+ /* Toggle BF buffer */
+ qp->bf->dedic.address = (void *)((uintptr_t)qp->bf->dedic.address ^ qp->bf_buf_size);
+ } else {
+ /* Toggle BF buffer */
+ qp->bf->cmn.address = (void *)((uintptr_t)qp->bf->cmn.address ^ qp->bf_buf_size);
+ mlx4_unlock(&qp->bf->cmn.lock);
+ }
+}
+
+static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl,
+ const int use_bf, const int dedic_bf, const int one_thread_auto_evict,
+ const int prefer_bf) __attribute__((always_inline));
+static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl,
+ const int use_bf, const int dedic_bf, const int one_thread_auto_evict,
+ const int prefer_bf)
+{
+ if (use_bf && nreq == 1 && (inl || prefer_bf) &&
+ size > 1 && size <= qp->bf_buf_size / 16) {
+ copy_wqe_to_bf(qp, ctrl, align(size * 16, 64),
+			       qp->sq.head, dedic_bf,
+ one_thread_auto_evict);
+ ++qp->sq.head;
+ } else if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * ringing non-cached doorbell record.
+ */
+ nc_wmb();
+ *qp->sdb = qp->doorbell_qpn;
+ }
+}
+
+static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl) __attribute__((noinline));
+static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl)
+{
+ struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context);
+
+ if (nreq == 1 && (inl || ctx->prefer_bf) && size > 1 && size <= qp->bf_buf_size / 16) {
+ convert_to_bf_wqe(qp, ctrl, qp->sq.head);
+
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ ++qp->sq.head;
+
+ wmb();
+
+ } else if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /* Controlled qp */
+ wmb();
+ }
+}
+
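+/*
+ * Ring the send doorbell using the method selected for this QP: a single
+ * small WQE can be copied straight to a BlueFlame register (dedicated or
+ * shared), otherwise the doorbell record pointed to by qp->sdb is written
+ * once the descriptors are visible in memory.
+ */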
+static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl) __attribute__((always_inline));
+static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl)
+{
+ if (unlikely(qp->create_flags & IBV_EXP_QP_CREATE_MANAGED_SEND))
+ return __ring_db_mng(qp, ctrl, nreq, size, inl);
+
+ switch (qp->db_method) {
+ case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 1);
+ case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 0);
+ case MLX4_QP_DB_METHOD_DEDIC_BF:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ case MLX4_QP_DB_METHOD_BF:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ case MLX4_QP_DB_METHOD_DB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 0, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ }
+}
+
+static void set_ctrl_seg(struct mlx4_wqe_ctrl_seg *ctrl, struct ibv_send_wr *wr,
+ struct mlx4_qp *qp, uint32_t imm, uint32_t srcrb_flags,
+ unsigned int owner_bit, int size, uint32_t wr_op)
+{
+ ctrl->srcrb_flags = srcrb_flags;
+ ctrl->imm = imm;
+ ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+ ctrl->owner_opcode = htonl(wr_op) | owner_bit;
+}
+
+static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline));
+static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+ int i;
+ int inl = 0;
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+ off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < num_sge; ++i) {
+ addr = (void *) (uintptr_t) sg_list[i].addr;
+ len = sg_list[i].length;
+ inl += len;
+
+ if (unlikely(inl > qp->max_inline_data))
+ return ENOMEM;
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ wmb(); /* see comment below */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len));
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof(*seg);
+ off = sizeof(*seg);
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (likely(seg_len)) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len));
+ }
+
+ *size += (inl + num_seg * sizeof(*seg) + 15) / 16;
+
+ return 0;
+}
+
+static inline void set_data_inl_seg_fast(struct mlx4_qp *qp,
+ void *addr, int length,
+ void *wqe, int *size,
+ unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_data_inl_seg_fast(struct mlx4_qp *qp,
+ void *addr, int length,
+ void *wqe, int *size,
+ unsigned int owner_bit)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ static const int first_seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg) - sizeof(struct mlx4_wqe_ctrl_seg);
+ static const int seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg);
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+
+ if (length <= first_seg_data_size) {
+ /* For the first segment there is no need to make sure
+ * all the data is visible before the byte_count field is set.
+		 * This is because the ctrl segment at the beginning of the
+		 * WQE already covers the HCA prefetcher issue.
+ */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length));
+
+ memcpy(wqe, addr, length);
+ *size += (length + sizeof(*seg) + 15) / 16;
+ } else {
+ void *start_wqe = seg;
+
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | first_seg_data_size));
+ memcpy(wqe, addr, first_seg_data_size);
+ length -= first_seg_data_size;
+ addr += first_seg_data_size;
+ seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg));
+ wqe += MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg);
+
+ while (length > seg_data_size) {
+ memcpy(wqe, addr, seg_data_size);
+ wmb(); /* see comment below */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_data_size));
+			length -= seg_data_size;
+ addr += seg_data_size;
+ seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN);
+ wqe += MLX4_INLINE_ALIGN;
+ }
+ memcpy(wqe, addr, length);
+
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length));
+ *size += (wqe + length - start_wqe + 15) / 16;
+ }
+}
+
+static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit)
+{
+ if (likely(num_sge == 1)) {
+ struct mlx4_wqe_data_seg *seg = wqe;
+
+ set_ptr_data(seg, sg_list, owner_bit);
+
+ *size += (sizeof(*seg) / 16);
+ } else {
+ struct mlx4_wqe_data_seg *seg = wqe;
+ int i;
+
+ for (i = num_sge - 1; i >= 0 ; --i)
+ set_ptr_data(seg + i, sg_list + i, owner_bit);
+
+ *size += num_sge * (sizeof(*seg) / 16);
+ }
+}
+
+static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl,
+ int num_sge, struct ibv_sge *sg_list, int *inl,
+ unsigned int owner_bit) __attribute__((always_inline));
+static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl,
+ int num_sge, struct ibv_sge *sg_list, int *inl,
+ unsigned int owner_bit)
+{
+ if (is_inl) {
+ /* inl is set to true if this is an inline data segment and num_sge > 0 */
+ *inl = num_sge > 0;
+ return set_data_inl_seg(qp, num_sge, sg_list, seg, sz,
+ owner_bit);
+ }
+ set_data_non_inl_seg(qp, num_sge, sg_list, seg, sz, owner_bit);
+
+ return 0;
+}
+
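+/*
+ * Common tail of the per-transport post_send_*() helpers: write the data
+ * segments (inline or pointer) and then the ctrl segment, returning the
+ * total WQE size in 16-byte units through *total_size.
+ */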
+static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp,
+ uint32_t srcrb_flags, uint32_t imm,
+ void *wqe, void *ctrl, int size, int *total_size,
+ int *inl, unsigned int ind) __attribute__((always_inline));
+static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp,
+ uint32_t srcrb_flags, uint32_t imm,
+ void *wqe, void *ctrl, int size, int *total_size,
+ int *inl, unsigned int ind)
+{
+ int ret;
+ unsigned int owner_bit = (ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0;
+
+ ret = set_data_seg(qp, wqe, &size, !!(wr->send_flags & IBV_SEND_INLINE),
+ wr->num_sge, wr->sg_list, inl, owner_bit);
+ if (unlikely(ret))
+ return ret;
+
+ *total_size = size;
+ set_ctrl_seg(ctrl, wr, qp, imm, srcrb_flags, owner_bit, size,
+ mlx4_ib_opcode[wr->opcode]);
+
+ return 0;
+
+}
+
+static int post_send_other(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
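+	/*
+	 * idx is a 2-bit index into qp->srcrb_flags_tbl: bit 0 is set when
+	 * IBV_SEND_SIGNALED is requested and bit 1 when IBV_SEND_SOLICITED is.
+	 */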
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+
+}
+
+static int post_send_rc_raw_packet(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+ uint32_t imm;
+ int idx;
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+
+	/* Sanity check - prevent posting an empty SR */
+ if (unlikely(!wr->num_sge))
+ return EINVAL;
+
+ if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) {
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED;
+ u.srcrb_flags = htonl((uint32_t)(qp->srcrb_flags_tbl[idx] | MLX4_WQE_CTRL_SOLICIT));
+ /* For raw eth, take the dmac from the payload */
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr;
+ imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2);
+ } else {
+ idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+
+ imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+ }
+
+ return set_common_segments(wr, qp, u.srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static int post_send_ud(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_datagram_seg);
+ size += sizeof(struct mlx4_wqe_datagram_seg) / 16;
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static inline int post_send_connected(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind, int is_xrc) __attribute__((always_inline));
+static inline int post_send_connected(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind, int is_xrc)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ uint32_t srcrb_flags;
+ uint32_t imm = 0;
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+
+ if (is_xrc)
+ srcrb_flags = htonl((wr->qp_type.xrc.remote_srqn << 8) |
+ (qp->srcrb_flags_tbl[idx]));
+ else
+ srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+
+ switch (wr->opcode) {
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, (struct ibv_exp_send_wr *)wr);
+ wqe += sizeof(struct mlx4_wqe_atomic_seg);
+ size += (sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IBV_WR_SEND_WITH_IMM:
+ imm = wr->imm_data;
+ break;
+
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ imm = wr->imm_data;
+ if (!wr->num_sge)
+ *inl = 1;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+ size += sizeof(struct mlx4_wqe_raddr_seg) / 16;
+ break;
+
+ case IBV_WR_RDMA_READ:
+ *inl = 1;
+ /* fall through */
+ case IBV_WR_RDMA_WRITE:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+ size += sizeof(struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+
+ case IBV_WR_SEND:
+ break;
+
+ default:
+		/* No extra segments required */
+ break;
+ }
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static int post_send_rc_uc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 0);
+}
+
+static int post_send_xrc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 1);
+}
+
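+/*
+ * Select the per-transport WQE builder for this QP so that the hot
+ * mlx4_post_send() path makes a single indirect call through
+ * qp->post_send_one instead of switching on the QP type per work request.
+ */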
+void mlx4_update_post_send_one(struct mlx4_qp *qp)
+{
+ switch (qp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_XRC:
+ qp->post_send_one = post_send_xrc;
+ break;
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ qp->post_send_one = post_send_rc_uc;
+ break;
+ case IBV_QPT_UD:
+ qp->post_send_one = post_send_ud;
+ break;
+
+ case IBV_QPT_RAW_PACKET:
+ qp->post_send_one = post_send_rc_raw_packet;
+ break;
+
+ default:
+ qp->post_send_one = post_send_other;
+ break;
}
}
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
- struct ibv_send_wr **bad_wr)
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ void *uninitialized_var(ctrl);
+ unsigned int ind;
+ int nreq;
+ int inl = 0;
+ int ret = 0;
+ int size = 0;
+
+ mlx4_lock(&qp->sq.lock);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		/* TODO: consider whether this first check can be dropped for QPs created via create_qp_exp */
+ if (!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW))
+ if (unlikely(wq_overflow(&qp->sq, nreq, qp))) {
+ ret = ENOMEM;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ ret = ENOMEM;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->opcode >= sizeof(mlx4_ib_opcode) / sizeof(mlx4_ib_opcode[0]))) {
+ ret = EINVAL;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ret = qp->post_send_one(wr, qp, ctrl, &size, &inl, ind);
+ if (unlikely(ret)) {
+ inl = 0;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (likely(wr->next))
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+#else
+			/* Make sure all owner bits are set to HW ownership */
+ set_owner_wqe(qp, ind, size,
+ ((ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0));
+#endif
+
+ ++ind;
+ }
+
+out:
+ ring_db(qp, ctrl, nreq, size, inl);
+
+ if (likely(nreq))
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+ (qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, ind - 1, size,
+ ((ind - 1) & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0));
+#endif
+ mlx4_unlock(&qp->sq.lock);
+
+ return ret;
+}
+
+int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr,
+ struct ibv_exp_send_wr **bad_wr)
{
- struct mlx4_context *ctx;
struct mlx4_qp *qp = to_mqp(ibqp);
void *wqe;
- struct mlx4_wqe_ctrl_seg *ctrl;
- int ind;
+ void *uninitialized_var(ctrl);
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+ uint32_t imm;
+ int idx;
+ unsigned int ind;
+ int uninitialized_var(owner_bit);
int nreq;
int inl = 0;
int ret = 0;
- int size;
- int i;
+ int size = 0;
+ uint32_t mlx4_wr_op;
+ uint64_t exp_send_flags;
- pthread_spin_lock(&qp->sq.lock);
+ mlx4_lock(&qp->sq.lock);
/* XXX check that state is OK to post send */
ind = qp->sq.head;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
- ret = -1;
+ exp_send_flags = wr->exp_send_flags;
+
+ if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW) &&
+ wq_overflow(&qp->sq, nreq, qp))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ ret = ENOMEM;
*bad_wr = wr;
goto out;
}
- if (wr->num_sge > qp->sq.max_gs) {
- ret = -1;
+ if (unlikely(wr->exp_opcode >= sizeof(mlx4_ib_opcode_exp) / sizeof(mlx4_ib_opcode_exp[0]))) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
- if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
- ret = -1;
+ if (((MLX4_IB_OPCODE_GET_CLASS(mlx4_ib_opcode_exp[wr->exp_opcode]) == MLX4_OPCODE_MANAGED) ||
+ (exp_send_flags & IBV_EXP_SEND_WITH_CALC)) &&
+ !(qp->create_flags & IBV_EXP_QP_CREATE_CROSS_CHANNEL)) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
+ mlx4_wr_op = MLX4_IB_OPCODE_GET_OP(mlx4_ib_opcode_exp[wr->exp_opcode]);
+
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+ owner_bit = ind & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0;
- ctrl->xrcrb_flags =
- (wr->send_flags & IBV_SEND_SIGNALED ?
- htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
- (wr->send_flags & IBV_SEND_SOLICITED ?
- htonl(MLX4_WQE_CTRL_SOLICIT) : 0) |
- qp->sq_signal_bits;
+ idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED |
+ (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) |
+ (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2);
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
- if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
- wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
- ctrl->imm = wr->imm_data;
- else
- ctrl->imm = 0;
+ imm = (MLX4_IB_OPCODE_GET_ATTR(mlx4_ib_opcode_exp[wr->exp_opcode]) & MLX4_OPCODE_WITH_IMM ?
+ wr->ex.imm_data : 0);
- wqe += sizeof *ctrl;
- size = sizeof *ctrl / 16;
+ wqe += sizeof(struct mlx4_wqe_ctrl_seg);
+ size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
- switch (ibqp->qp_type) {
+ switch (qp->qp_type) {
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_XRC:
- ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
- /* fall thru */
+ u.srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
case IBV_QPT_RC:
case IBV_QPT_UC:
- switch (wr->opcode) {
- case IBV_WR_ATOMIC_CMP_AND_SWP:
- case IBV_WR_ATOMIC_FETCH_AND_ADD:
- set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
- wr->wr.atomic.rkey);
+ switch (wr->exp_opcode) {
+ case IBV_EXP_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_EXP_WR_ATOMIC_FETCH_AND_ADD:
+ case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD:
+ if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) {
+ if (!qp->is_masked_atomic) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_raddr_seg(wqe,
+ wr->ext_op.masked_atomics.remote_addr,
+ wr->ext_op.masked_atomics.rkey);
+ } else {
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ }
wqe += sizeof (struct mlx4_wqe_raddr_seg);
set_atomic_seg(wqe, wr);
@@ -264,184 +1239,259 @@
break;
- case IBV_WR_RDMA_READ:
+ case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP:
+ if (!qp->is_masked_atomic) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_raddr_seg(wqe,
+ wr->ext_op.masked_atomics.remote_addr,
+ wr->ext_op.masked_atomics.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+
+ set_masked_atomic_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_masked_atomic_seg);
+ size += (sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_masked_atomic_seg)) / 16;
+ break;
+
+ case IBV_EXP_WR_RDMA_READ:
inl = 1;
/* fall through */
- case IBV_WR_RDMA_WRITE:
- case IBV_WR_RDMA_WRITE_WITH_IMM:
- set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
- wr->wr.rdma.rkey);
+ case IBV_EXP_WR_RDMA_WRITE_WITH_IMM:
+ if (!wr->num_sge)
+ inl = 1;
+ /* fall through */
+ case IBV_EXP_WR_RDMA_WRITE:
+ if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) {
+
+ if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER ||
+ (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER ||
+ (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER ||
+ !mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].valid) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ mlx4_wr_op = MLX4_OPCODE_CALC_RDMA_WRITE_IMM |
+ mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].opcode;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+
+ } else {
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ }
wqe += sizeof (struct mlx4_wqe_raddr_seg);
size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
break;
- default:
- /* No extra segments required for sends */
+ case IBV_EXP_WR_LOCAL_INV:
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof
+ (struct mlx4_wqe_local_inval_seg);
+ size += sizeof
+ (struct mlx4_wqe_local_inval_seg) / 16;
break;
- }
- break;
-
- case IBV_QPT_UD:
- set_datagram_seg(wqe, wr);
- wqe += sizeof (struct mlx4_wqe_datagram_seg);
- size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
- if (to_mah(wr->wr.ud.ah)->tagged) {
- ctrl->ins_vlan = 1 << 6;
- ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
- }
- break;
+ case IBV_EXP_WR_BIND_MW:
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof
+ (struct mlx4_wqe_bind_seg);
+ size += sizeof
+ (struct mlx4_wqe_bind_seg) / 16;
+ break;
- default:
- break;
- }
+ case IBV_EXP_WR_SEND:
+ if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) {
+
+ if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER ||
+ (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER ||
+ (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER ||
+ !mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].valid) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ mlx4_wr_op = MLX4_OPCODE_CALC_SEND |
+ mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].opcode;
+ }
- if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
- struct mlx4_wqe_inline_seg *seg;
- void *addr;
- int len, seg_len;
- int num_seg;
- int off, to_copy;
+ break;
- inl = 0;
+ case IBV_EXP_WR_CQE_WAIT:
+ {
+ struct mlx4_cq *wait_cq = to_mcq(wr->task.cqe_wait.cq);
+ uint32_t wait_index = 0;
- seg = wqe;
- wqe += sizeof *seg;
- off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
- num_seg = 0;
- seg_len = 0;
+ wait_index = wait_cq->wait_index +
+ wr->task.cqe_wait.cq_count;
+ wait_cq->wait_count = max(wait_cq->wait_count,
+ wr->task.cqe_wait.cq_count);
- for (i = 0; i < wr->num_sge; ++i) {
- addr = (void *) (uintptr_t) wr->sg_list[i].addr;
- len = wr->sg_list[i].length;
- inl += len;
+ if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) {
+ wait_cq->wait_index += wait_cq->wait_count;
+ wait_cq->wait_count = 0;
+ }
- if (inl > qp->max_inline_data) {
- inl = 0;
- ret = -1;
- *bad_wr = wr;
- goto out;
+ set_wait_en_seg(wqe, wait_cq->cqn, wait_index);
+ wqe += sizeof(struct mlx4_wqe_wait_en_seg);
+ size += sizeof(struct mlx4_wqe_wait_en_seg) / 16;
}
+ break;
- while (len >= MLX4_INLINE_ALIGN - off) {
- to_copy = MLX4_INLINE_ALIGN - off;
- memcpy(wqe, addr, to_copy);
- len -= to_copy;
- wqe += to_copy;
- addr += to_copy;
- seg_len += to_copy;
- wmb(); /* see comment below */
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
- seg_len = 0;
- seg = wqe;
- wqe += sizeof *seg;
- off = sizeof *seg;
- ++num_seg;
+ case IBV_EXP_WR_SEND_ENABLE:
+ case IBV_EXP_WR_RECV_ENABLE:
+ {
+ unsigned head_en_index;
+ struct mlx4_wq *wq;
+
+ /*
+ * Posting a work request to a QP that does not support
+ * SEND/RECV ENABLE degrades performance.
+ */
+ if (((wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) &&
+ !(to_mqp(wr->task.wqe_enable.qp)->create_flags &
+ IBV_EXP_QP_CREATE_MANAGED_SEND)) ||
+ ((wr->exp_opcode == IBV_EXP_WR_RECV_ENABLE) &&
+ !(to_mqp(wr->task.wqe_enable.qp)->create_flags &
+ IBV_EXP_QP_CREATE_MANAGED_RECV))) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wq = (wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) ?
+ &to_mqp(wr->task.wqe_enable.qp)->sq :
+ &to_mqp(wr->task.wqe_enable.qp)->rq;
+
+ /* If wqe_count is 0, release all WRs from the queue */
+ if (wr->task.wqe_enable.wqe_count) {
+ head_en_index = wq->head_en_index +
+ wr->task.wqe_enable.wqe_count;
+ wq->head_en_count = max(wq->head_en_count,
+ wr->task.wqe_enable.wqe_count);
+
+ if ((int)(wq->head - head_en_index) < 0) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+ } else {
+ head_en_index = wq->head;
+ wq->head_en_count = wq->head - wq->head_en_index;
+ }
+
+ if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) {
+ wq->head_en_index += wq->head_en_count;
+ wq->head_en_count = 0;
+ }
+
+ set_wait_en_seg(wqe,
+ wr->task.wqe_enable.qp->qp_num,
+ head_en_index);
+
+ wqe += sizeof(struct mlx4_wqe_wait_en_seg);
+ size += sizeof(struct mlx4_wqe_wait_en_seg) / 16;
}
+ break;
- memcpy(wqe, addr, len);
- wqe += len;
- seg_len += len;
- off += len;
- }
+ case IBV_EXP_WR_SEND_WITH_INV:
+ imm = htonl(wr->ex.invalidate_rkey);
+ break;
- if (seg_len) {
- ++num_seg;
- /*
- * Need a barrier here to make sure
- * all the data is visible before the
- * byte_count field is set. Otherwise
- * the HCA prefetcher could grab the
- * 64-byte chunk with this inline
- * segment and get a valid (!=
- * 0xffffffff) byte count but stale
- * data, and end up sending the wrong
- * data.
- */
- wmb();
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ default:
+ /* No extra segments required for sends */
+ break;
}
+ break;
- size += (inl + num_seg * sizeof * seg + 15) / 16;
- } else {
- struct mlx4_wqe_data_seg *seg = wqe;
+ case IBV_QPT_UD:
+ set_datagram_seg(wqe, (struct ibv_send_wr *)wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
- for (i = wr->num_sge - 1; i >= 0 ; --i)
- set_data_seg(seg + i, wr->sg_list + i);
+ case IBV_QPT_RAW_PACKET:
+ /* Sanity check - prevent posting an empty SR */
+ if (unlikely(!wr->num_sge)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) {
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT);
+ /* For raw eth, take the dmac from the payload */
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr;
+ imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2);
+ }
+ break;
- size += wr->num_sge * (sizeof *seg / 16);
+ default:
+ break;
}
- ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
- MLX4_WQE_CTRL_FENCE : 0) | size;
-
- /*
- * Make sure descriptor is fully written before
- * setting ownership bit (because HW can start
- * executing as soon as we do).
- */
- wmb();
-
- ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
- (ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
+ ret = set_data_seg(qp, wqe, &size, !!(exp_send_flags & IBV_EXP_SEND_INLINE),
+ wr->num_sge, wr->sg_list, &inl, owner_bit);
+ if (unlikely(ret)) {
+ inl = 0;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_ctrl_seg(ctrl, (struct ibv_send_wr *)wr, qp, imm, u.srcrb_flags, owner_bit, size, mlx4_wr_op);
/*
* We can improve latency by not stamping the last
* send queue WQE until after ringing the doorbell, so
* only stamp here if there are still more WQEs to post.
*/
- if (wr->next)
+ if (likely(wr->next))
+#ifndef MLX4_WQE_FORMAT
stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
(qp->sq.wqe_cnt - 1));
-
+#else
+ set_owner_wqe(qp, ind, size, owner_bit);
+#endif
++ind;
}
out:
- ctx = to_mctx(ibqp->context);
-
- if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
- ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
- *(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
- /*
- * Make sure that descriptor is written to memory
- * before writing to BlueFlame page.
- */
- wmb();
-
- ++qp->sq.head;
-
- pthread_spin_lock(&ctx->bf_lock);
-
- mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
- align(size * 16, 64));
- wc_wmb();
-
- ctx->bf_offset ^= ctx->bf_buf_size;
-
- pthread_spin_unlock(&ctx->bf_lock);
- } else if (nreq) {
- qp->sq.head += nreq;
-
- /*
- * Make sure that descriptors are written before
- * doorbell record.
- */
- wmb();
-
- *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
- }
-
- if (nreq)
+ ring_db(qp, ctrl, nreq, size, inl);
+ if (likely(nreq))
+#ifndef MLX4_WQE_FORMAT
stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
(qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, ind - 1, size, owner_bit);
+#endif
- pthread_spin_unlock(&qp->sq.lock);
+ mlx4_unlock(&qp->sq.lock);
return ret;
}
+
+
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
{
@@ -449,24 +1499,25 @@
struct mlx4_wqe_data_seg *scat;
int ret = 0;
int nreq;
- int ind;
+ unsigned int ind;
int i;
+ struct mlx4_inlr_rbuff *rbuffs;
- pthread_spin_lock(&qp->rq.lock);
+ mlx4_lock(&qp->rq.lock);
/* XXX check that state is OK to post receive */
-
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
- ret = -1;
+ if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW) &&
+ wq_overflow(&qp->rq, nreq, qp))) {
+ ret = ENOMEM;
*bad_wr = wr;
goto out;
}
- if (wr->num_sge > qp->rq.max_gs) {
- ret = -1;
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
@@ -476,11 +1527,20 @@
for (i = 0; i < wr->num_sge; ++i)
__set_data_seg(scat + i, wr->sg_list + i);
- if (i < qp->rq.max_gs) {
+ if (likely(i < qp->rq.max_gs)) {
scat[i].byte_count = 0;
scat[i].lkey = htonl(MLX4_INVALID_LKEY);
scat[i].addr = 0;
}
+ if (qp->max_inlr_sg) {
+ rbuffs = qp->inlr_buff.buff[ind].sg_list;
+ qp->inlr_buff.buff[ind].list_len = wr->num_sge;
+ for (i = 0; i < wr->num_sge; ++i) {
+ rbuffs->rbuff = (void *)(unsigned long)(wr->sg_list[i].addr);
+ rbuffs->rlen = wr->sg_list[i].length;
+ rbuffs++;
+ }
+ }
qp->rq.wrid[ind] = wr->wr_id;
@@ -488,7 +1548,7 @@
}
out:
- if (nreq) {
+ if (likely(nreq)) {
qp->rq.head += nreq;
/*
@@ -500,7 +1560,7 @@
*qp->db = htonl(qp->rq.head & 0xffff);
}
- pthread_spin_unlock(&qp->rq.lock);
+ mlx4_unlock(&qp->rq.lock);
return ret;
}
@@ -533,6 +1593,7 @@
struct mlx4_qp *qp)
{
int size;
+ int atomic_size;
int max_sq_sge;
max_sq_sge = align(cap->max_inline_data +
@@ -553,6 +1614,7 @@
size += sizeof (struct mlx4_wqe_raddr_seg);
break;
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_XRC:
case IBV_QPT_RC:
size += sizeof (struct mlx4_wqe_raddr_seg);
@@ -560,12 +1622,14 @@
* An atomic op will require an atomic segment, a
* remote address segment and one scatter entry.
*/
- if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
- sizeof (struct mlx4_wqe_raddr_seg) +
- sizeof (struct mlx4_wqe_data_seg)))
- size = (sizeof (struct mlx4_wqe_atomic_seg) +
- sizeof (struct mlx4_wqe_raddr_seg) +
- sizeof (struct mlx4_wqe_data_seg));
+ atomic_size = (qp->is_masked_atomic ?
+ sizeof(struct mlx4_wqe_masked_atomic_seg) :
+ sizeof(struct mlx4_wqe_atomic_seg)) +
+ sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_data_seg);
+
+ if (size < atomic_size)
+ size = atomic_size;
break;
default:
@@ -583,56 +1647,39 @@
; /* nothing */
}
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
- enum ibv_qp_type type, struct mlx4_qp *qp)
+int mlx4_use_huge(struct ibv_context *context, const char *key)
{
- qp->rq.max_gs = cap->max_recv_sge;
+ char e[VERBS_MAX_ENV_VAL];
- qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
- if (!qp->sq.wrid)
- return -1;
+ if (!ibv_exp_cmd_getenv(context, key, e, sizeof(e)) && !strcmp(e, "y"))
+ return 1;
+ return 0;
+}
+
+void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp)
+{
if (qp->rq.wqe_cnt) {
- qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
- if (!qp->rq.wrid) {
- free(qp->sq.wrid);
- return -1;
+ free(qp->rq.wrid);
+ if (qp->max_inlr_sg) {
+ free(qp->inlr_buff.buff[0].sg_list);
+ free(qp->inlr_buff.buff);
}
}
-
- for (qp->rq.wqe_shift = 4;
- 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
- qp->rq.wqe_shift++)
- ; /* nothing */
-
- qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
- (qp->sq.wqe_cnt << qp->sq.wqe_shift);
- if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
- qp->rq.offset = 0;
- qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
- } else {
- qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
- qp->sq.offset = 0;
- }
-
- if (mlx4_alloc_buf(&qp->buf,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size),
- to_mdev(pd->context->device)->page_size)) {
+ if (qp->sq.wqe_cnt)
free(qp->sq.wrid);
- free(qp->rq.wrid);
- return -1;
- }
- memset(qp->buf.buf, 0, qp->buf_size);
-
- return 0;
+ if (qp->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(context), &qp->buf);
+ else
+ mlx4_free_buf(&qp->buf);
}
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
enum ibv_qp_type type)
{
int wqe_size;
- struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);
+ struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context);
wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
sizeof (struct mlx4_wqe_ctrl_seg);
@@ -641,9 +1688,10 @@
wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
break;
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_XRC:
case IBV_QPT_UC:
case IBV_QPT_RC:
- case IBV_QPT_XRC:
wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
break;
@@ -704,3 +1752,812 @@
else
ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}
+
+int mlx4_post_task(struct ibv_context *context,
+ struct ibv_exp_task *task_list,
+ struct ibv_exp_task **bad_task)
+{
+ int rc = 0;
+ struct ibv_exp_task *cur_task = NULL;
+ struct ibv_exp_send_wr *bad_wr;
+ struct mlx4_context *mlx4_ctx = to_mctx(context);
+
+ if (!task_list)
+ return rc;
+
+ pthread_mutex_lock(&mlx4_ctx->task_mutex);
+
+ cur_task = task_list;
+ while (!rc && cur_task) {
+
+ switch (cur_task->task_type) {
+ case IBV_EXP_TASK_SEND:
+ rc = ibv_exp_post_send(cur_task->item.qp,
+ cur_task->item.send_wr,
+ &bad_wr);
+ break;
+
+ case IBV_EXP_TASK_RECV:
+ rc = ibv_post_recv(cur_task->item.qp,
+ cur_task->item.recv_wr,
+ NULL);
+ break;
+
+ default:
+ rc = -1;
+ }
+
+ if (rc && bad_task) {
+ *bad_task = cur_task;
+ break;
+ }
+
+ cur_task = cur_task->next;
+ }
+
+ pthread_mutex_unlock(&mlx4_ctx->task_mutex);
+
+ return rc;
+}
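+
+/*
+ * Illustrative usage sketch (not part of this patch): tasks are chained
+ * through ->next and posted in order under task_mutex.  The QP and WR
+ * variables below are placeholders, and ibv_exp_post_task() is assumed to
+ * be the experimental entry point that routes to mlx4_post_task():
+ *
+ *	struct ibv_exp_task t[2], *bad;
+ *	memset(t, 0, sizeof(t));
+ *	t[0].task_type = IBV_EXP_TASK_SEND;
+ *	t[0].item.qp = send_qp;
+ *	t[0].item.send_wr = &send_wr;
+ *	t[0].next = &t[1];
+ *	t[1].task_type = IBV_EXP_TASK_RECV;
+ *	t[1].item.qp = recv_qp;
+ *	t[1].item.recv_wr = &recv_wr;
+ *	ibv_exp_post_task(ctx, t, &bad);
+ */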
+
+/*
+ * family interface functions
+ */
+
+/*
+ * send_pending - a general post-send function that puts one message in
+ * the send queue without ringing the QP doorbell.
+ *
+ * The user may call this function several times to fill the send queue with
+ * several messages and then call mlx4_send_flush to ring the QP doorbell
+ * (an illustrative usage sketch follows the function body below).
+ *
+ * This function is used to implement the following QP burst family functions:
+ * - send_pending
+ * - send_pending_inline
+ * - send_pending_sg_list
+ * - send_burst
+ */
+static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags,
+ const int use_raw_eth, const int use_inl,
+ const int thread_safe, const int wqe_64,
+ const int use_sg_list, int num_sge,
+ struct ibv_sge *sg_list,
+ const int lb) __attribute__((always_inline));
+static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags,
+ const int use_raw_eth, const int use_inl,
+ const int thread_safe, const int wqe_64,
+ const int use_sg_list, int num_sge,
+ struct ibv_sge *sg_list,
+ const int lb)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ uint32_t tunnel_offload = 0;
+ unsigned int owner_bit = qp->sq.head & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0;
+ int size;
+ int idx;
+ int i;
+
+ if (thread_safe)
+ mlx4_lock(&qp->sq.lock);
+
+ if (wqe_64)
+ ctrl = get_send_wqe64(qp, qp->sq.head & (qp->sq.wqe_cnt - 1));
+ else
+ ctrl = get_send_wqe(qp, qp->sq.head & (qp->sq.wqe_cnt - 1));
+
+ dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) + sizeof(struct mlx4_wqe_ctrl_seg));
+
+ if (use_sg_list) {
+ for (i = num_sge - 1; i >= 0 ; --i)
+ set_ptr_data(dseg + i, sg_list + i, owner_bit);
+
+ size = (sizeof(struct mlx4_wqe_ctrl_seg) + (num_sge * sizeof(struct mlx4_wqe_data_seg)))/ 16;
+ } else {
+ if (use_inl) {
+ size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ set_data_inl_seg_fast(qp, (void *)(uintptr_t)addr, length, dseg, &size, owner_bit);
+ } else {
+ size = (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))/ 16;
+ dseg->byte_count = SET_BYTE_COUNT(length);
+ dseg->lkey = htonl(lkey);
+ dseg->addr = htonll(addr);
+ }
+ }
+
+ if (use_raw_eth) {
+ /* For raw eth, the SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ idx = IBV_EXP_QP_BURST_SOLICITED |
+ (flags & (IBV_EXP_QP_BURST_SIGNALED |
+ IBV_EXP_QP_BURST_IP_CSUM |
+ IBV_EXP_QP_BURST_TUNNEL));
+ tunnel_offload = flags & IBV_EXP_QP_BURST_TUNNEL ? MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_IL4 : 0;
+ } else {
+ idx = (flags & (IBV_EXP_QP_BURST_SIGNALED |
+ IBV_EXP_QP_BURST_SOLICITED |
+ IBV_EXP_QP_BURST_IP_CSUM));
+ }
+
+ if (use_raw_eth && lb) {
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ /* For raw eth, take the dmac from the payload */
+ if (use_sg_list)
+ addr = sg_list[0].addr;
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)addr;
+ ctrl->srcrb_flags = u.srcrb_flags;
+ ctrl->imm = *(uint32_t *)((uintptr_t)(addr)+2);
+ } else {
+ ctrl->srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ ctrl->imm = 0;
+ }
+ ctrl->fence_size = (flags & IBV_EXP_QP_BURST_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = htonl(MLX4_OPCODE_SEND | tunnel_offload) | owner_bit;
+ qp->sq.head++;
+
+ if (!wqe_64)
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (qp->sq.head + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, qp->sq.head, size, owner_bit);
+#endif
+ if (thread_safe)
+ mlx4_unlock(&qp->sq.lock);
+ else
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ return 0;
+}
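+
+/*
+ * Illustrative usage sketch (not part of this patch): the family pointer is
+ * obtained via mlx4_get_qp_burst_family() further below; addr/len/lkey are
+ * placeholders.  Several messages are queued and a single doorbell is rung:
+ *
+ *	fam->send_pending(qp, addr1, len1, lkey, IBV_EXP_QP_BURST_SIGNALED);
+ *	fam->send_pending(qp, addr2, len2, lkey, 0);
+ *	fam->send_flush(qp);
+ */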
+
+/* burst family - send_pending */
+static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET &&
+ mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, inl, safe, */
+ return send_pending(qp, addr, length, lkey, flags, raw_eth, 0, 1,
+ /* wqe_64, use_sg, num_sge, sg_list, lb */
+ wqe_64, 0, 0, NULL, lb);
+}
+
+static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags)
+{
+ return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 1);
+}
+
+static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags)
+{
+ return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, uint64_t addr, \
+ uint32_t length, uint32_t lkey, \
+ uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, uint64_t addr, \
+ uint32_t length, uint32_t lkey, \
+ uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(qp, addr, length, lkey, flags, eth, 0, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list */ \
+ 0, wqe64, 0, 0, NULL, \
+ /* lb */ \
+ lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_UNSAFE(1, 1, 1);
+
+/* burst family - send_pending_inline */
+static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr,
+ uint32_t length, uint32_t flags,
+ const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr,
+ uint32_t length, uint32_t flags,
+ const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, */
+ return send_pending(qp, (uintptr_t)addr, length, 0, flags, raw_eth,
+ /* inl, safe, wqe_64, use_sg, num_sge, sg_list, lb */
+ 1, 1, wqe_64, 0, 0, NULL, lb);
+}
+
+static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags)
+{
+ return mlx4_send_pending_inl_safe(qp, addr, length, flags, 1);
+}
+
+static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags)
+{
+ return mlx4_send_pending_inl_safe(qp, addr, length, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_inl_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_INL_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, void *addr, \
+ uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, void *addr, \
+ uint32_t length, uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(qp, (uintptr_t)addr, length, 0, flags, eth, 1, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \
+ 0, wqe64, 0, 0, NULL, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 1);
+
+/* burst family - send_pending_sg_list */
+static inline int mlx4_send_pending_sg_list_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_sg_list_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(ibqp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, inl, */
+ return send_pending(ibqp, 0, 0, 0, flags, raw_eth, 0,
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */
+ 1, wqe_64, 1, num, sg_list, lb);
+}
+static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 1);
+}
+
+static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_sg_list_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_SG_LIST_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(ibqp, 0, 0, 0, flags, eth, 0, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \
+ 0, wqe64, 1, num, sg_list, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 1);
+
+static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) __attribute__((always_inline));
+/* burst family - send_burst */
+static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int raw_eth, const int thread_safe,
+ const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) __attribute__((always_inline));
+static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int raw_eth, const int thread_safe,
+ const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int i;
+
+ if (unlikely(thread_safe))
+ mlx4_lock(&qp->sq.lock);
+
+ for (i = 0; i < num; i++, sg_list++)
+ /* qp, addr, length, lkey, */
+ send_pending(ibqp, sg_list->addr, sg_list->length, sg_list->lkey,
+ /* flags, raw_eth, inl, safe, wqe_64, use_sg, */
+ flags, raw_eth, 0, 0, wqe_64, 0,
+ /* num_sge, sg_list, lb */
+ 0, NULL, lb);
+
+ if (use_bf)
+ /* use send_flush_unsafe since lock is already taken if needed */
+ send_flush_unsafe(ibqp, _1thrd_evict, wqe_64);
+ else
+ *qp->sdb = qp->doorbell_qpn;
+
+ if (unlikely(thread_safe))
+ mlx4_unlock(&qp->sq.lock);
+
+ return 0;
+}
+
+static inline int mlx4_send_burst_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_burst_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(ibqp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+ int _1thrd_evict = mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB ||
+ mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ int use_bf = mqp->db_method != MLX4_QP_DB_METHOD_DB;
+
+ return send_msg_list(ibqp, sg_list, num, flags, raw_eth, 1, wqe_64, use_bf, _1thrd_evict, lb);
+}
+
+static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 1);
+}
+
+static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 0);
+}
+
+#define MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb) mlx4_send_burst_unsafe_##_1thrd_evict##eth##wqe64##lb
+#define MLX4_SEND_BURST_UNSAFE(_1thrd_evict, eth, wqe64, lb) \
+ static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 1, _1thrd_evict, \
+ lb); \
+ }
+/* _1thrd_evict, eth, wqe64, lb */
+MLX4_SEND_BURST_UNSAFE(0, 0, 0, 0);
+MLX4_SEND_BURST_UNSAFE(0, 0, 0, 1);
+MLX4_SEND_BURST_UNSAFE(0, 0, 1, 0);
+MLX4_SEND_BURST_UNSAFE(0, 0, 1, 1);
+MLX4_SEND_BURST_UNSAFE(0, 1, 0, 0);
+MLX4_SEND_BURST_UNSAFE(0, 1, 0, 1);
+MLX4_SEND_BURST_UNSAFE(0, 1, 1, 0);
+MLX4_SEND_BURST_UNSAFE(0, 1, 1, 1);
+MLX4_SEND_BURST_UNSAFE(1, 0, 0, 0);
+MLX4_SEND_BURST_UNSAFE(1, 0, 0, 1);
+MLX4_SEND_BURST_UNSAFE(1, 0, 1, 0);
+MLX4_SEND_BURST_UNSAFE(1, 0, 1, 1);
+MLX4_SEND_BURST_UNSAFE(1, 1, 0, 0);
+MLX4_SEND_BURST_UNSAFE(1, 1, 0, 1);
+MLX4_SEND_BURST_UNSAFE(1, 1, 1, 0);
+MLX4_SEND_BURST_UNSAFE(1, 1, 1, 1);
+
+#define MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb) mlx4_send_burst_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_BURST_UNSAFE_DB(eth, wqe64, lb) \
+ static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 0, 0, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_BURST_UNSAFE_DB(0, 0, 0);
+MLX4_SEND_BURST_UNSAFE_DB(0, 0, 1);
+MLX4_SEND_BURST_UNSAFE_DB(0, 1, 0);
+MLX4_SEND_BURST_UNSAFE_DB(0, 1, 1);
+MLX4_SEND_BURST_UNSAFE_DB(1, 0, 0);
+MLX4_SEND_BURST_UNSAFE_DB(1, 0, 1);
+MLX4_SEND_BURST_UNSAFE_DB(1, 1, 0);
+MLX4_SEND_BURST_UNSAFE_DB(1, 1, 1);
+
+/* burst family - send_flush */
+static int mlx4_send_flush_db(struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__;
+static int mlx4_send_flush_db(struct ibv_qp *ibqp)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
+ *qp->sdb = qp->doorbell_qpn;
+
+ return 0;
+}
+
+static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
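+ /*
+ * Fast path: exactly one WQE was queued since the last flush and it
+ * fits in the BlueFlame buffer, so copy it straight to the BlueFlame
+ * register; otherwise fall back to ringing the regular doorbell.
+ */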
+ if (qp->last_db_head + 1 == qp->sq.head) {
+ struct mlx4_wqe_ctrl_seg *ctrl = get_send_wqe(qp, qp->last_db_head & (qp->sq.wqe_cnt - 1));
+ int size = ctrl->fence_size & 0x3f;
+
+ /*
+ * There is no need to check that size > 1 since we get here only
+ * after using the send_pending function, which guarantees that size > 1
+ */
+ if (wqe64)
+ copy_wqe_to_bf(qp, ctrl, 64, qp->last_db_head,
+ 1, _1thrd_evict);
+ else if (size <= qp->bf_buf_size / 16)
+ copy_wqe_to_bf(qp, ctrl, align(size * 16, 64),
+ qp->last_db_head,
+ 1, _1thrd_evict);
+ else
+ *qp->sdb = qp->doorbell_qpn;
+ } else {
+ *qp->sdb = qp->doorbell_qpn;
+ }
+ qp->last_db_head = qp->sq.head;
+
+ return 0;
+}
+
+#define MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64) mlx4_send_flush_unsafe_##_1thrd_evict##wqe64
+#define MLX4_SEND_FLUSH_UNSAFE(_1thrd_evict, wqe64) \
+ static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \
+ struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \
+ struct ibv_qp *ibqp) \
+ { \
+ return send_flush_unsafe(ibqp, _1thrd_evict, wqe64); \
+ }
+
+/* _1thrd_evict, wqe64 */
+MLX4_SEND_FLUSH_UNSAFE(0, 0);
+MLX4_SEND_FLUSH_UNSAFE(1, 0);
+MLX4_SEND_FLUSH_UNSAFE(0, 1);
+MLX4_SEND_FLUSH_UNSAFE(1, 1);
+
+/* burst family - recv_burst */
+static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ const int thread_safe, const int use_inline_recv, const int max_one_sge) __attribute__((always_inline));
+static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ const int thread_safe, const int use_inline_recv, const int max_one_sge)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ struct mlx4_inlr_rbuff *rbuffs;
+ unsigned int ind;
+ int i;
+
+ if (thread_safe)
+ mlx4_lock(&qp->rq.lock);
+
+ for (i = 0; i < num; ++i) {
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+ scat = get_recv_wqe(qp, ind);
+ __set_data_seg(scat, sg_list);
+
+ if (!max_one_sge) {
+ scat[1].byte_count = 0;
+ scat[1].lkey = htonl(MLX4_INVALID_LKEY);
+ scat[1].addr = 0;
+ }
+
+ if (use_inline_recv) {
+ rbuffs = qp->inlr_buff.buff[ind].sg_list;
+ qp->inlr_buff.buff[ind].list_len = 1;
+ rbuffs->rbuff = (void *)(unsigned long)(sg_list->addr);
+ rbuffs->rlen = sg_list->length;
+ rbuffs++;
+ }
+ sg_list++;
+ qp->rq.head++;
+ }
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db = htonl(qp->rq.head & 0xffff);
+
+ if (thread_safe)
+ mlx4_unlock(&qp->rq.lock);
+
+ return 0;
+}
+
+static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) __MLX4_ALGN_FUNC__;
+static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
+ return recv_burst(ibqp, sg_list, num, 1, qp->max_inlr_sg, qp->rq.max_gs == 1);
+}
+#define MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge) mlx4_recv_burst_unsafe_##inlr##_1sge
+#define MLX4_RECV_BURST_UNSAFE(inlr, _1sge) \
+ static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num) __MLX4_ALGN_FUNC__; \
+ static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num) \
+ { \
+ return recv_burst(ibqp, sg_list, num, 0, inlr, _1sge); \
+ }
+/* inlr, _1sge */
+MLX4_RECV_BURST_UNSAFE(0, 0);
+MLX4_RECV_BURST_UNSAFE(1, 0);
+MLX4_RECV_BURST_UNSAFE(0, 1);
+MLX4_RECV_BURST_UNSAFE(1, 1);
+
+/*
+ * qp_burst family implementation for safe QP
+ */
+struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_lb = {
+ .send_burst = mlx4_send_burst_safe_lb,
+ .send_pending = mlx4_send_pending_safe_lb,
+ .send_pending_inline = mlx4_send_pending_inl_safe_lb,
+ .send_pending_sg_list = mlx4_send_pending_sg_list_safe_lb,
+ .recv_burst = mlx4_recv_burst_safe,
+ .send_flush = mlx4_send_flush_db
+};
+
+struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_no_lb = {
+ .send_burst = mlx4_send_burst_safe_no_lb,
+ .send_pending = mlx4_send_pending_safe_no_lb,
+ .send_pending_inline = mlx4_send_pending_inl_safe_no_lb,
+ .send_pending_sg_list = mlx4_send_pending_sg_list_safe_no_lb,
+ .recv_burst = mlx4_recv_burst_safe,
+ .send_flush = mlx4_send_flush_db
+};
+
+/*
+ * qp_burst family implementation table for unsafe QP
+ */
+#define MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \
+ (lb << 5 | _1thrd_evict << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge)
+
+#define MLX4_QP_BURST_UNSAFE_TBL_ENTRY(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \
+ [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)] = { \
+ .send_burst = MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb), \
+ .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \
+ .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \
+ .send_flush = MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64), \
+ }
+static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_tbl[1 << 6] = {
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 1),
+};
+
+#define MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge) \
+ (lb << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge)
+
+#define MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(lb, eth, wqe64, inlr, _1sge) \
+ [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)] = { \
+ .send_burst = MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb), \
+ .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \
+ .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \
+ .send_flush = mlx4_send_flush_db, \
+ }
+static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_db_tbl[1 << 5] = {
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 1),
+};
+
+struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ enum ibv_exp_query_intf_status ret = IBV_EXP_INTF_STAT_OK;
+ struct ibv_exp_qp_burst_family *family = NULL;
+ uint32_t unsupported_f;
+
+ if ((qp->verbs_qp.qp.state < IBV_QPS_INIT) || (qp->verbs_qp.qp.state > IBV_QPS_RTS)) {
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ_STATE;
+ return NULL;
+ }
+
+ if (params->flags) {
+ fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for QP family\n", params->flags);
+ *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+ unsupported_f = params->family_flags & ~(IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK |
+ IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR);
+ if (unsupported_f) {
+ fprintf(stderr, PFX "Family flags(0x%x) are not supported for QP family\n", unsupported_f);
+ *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ switch (qp->qp_type) {
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ case IBV_QPT_RAW_PACKET:
+ if (qp->model_flags & MLX4_QP_MODEL_FLAG_THREAD_SAFE) {
+ int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK);
+
+ if (lb)
+ family = &mlx4_qp_burst_family_safe_lb;
+ else
+ family = &mlx4_qp_burst_family_safe_no_lb;
+ } else {
+ int eth = qp->qp_type == IBV_QPT_RAW_PACKET &&
+ qp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe64 = qp->sq.wqe_shift == 6;
+ int inlr = qp->max_inlr_sg != 0;
+ int _1sge = qp->rq.max_gs == 1;
+ int _1thrd_evict = qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB ||
+ qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK);
+
+ if (qp->db_method == MLX4_QP_DB_METHOD_DB)
+ family = &mlx4_qp_burst_family_unsafe_db_tbl
+ [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)];
+ else
+ family = &mlx4_qp_burst_family_unsafe_tbl
+ [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)];
+ }
+ break;
+
+ default:
+ ret = IBV_EXP_INTF_STAT_INVAL_PARARM;
+ break;
+ }
+
+ *status = ret;
+
+ return family;
+}
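+
+/*
+ * Illustrative only (not part of this patch): callers are expected to reach
+ * mlx4_get_qp_burst_family() through the experimental interface-query verb.
+ * The field and enum names below (ibv_exp_query_intf, IBV_EXP_INTF_QP_BURST,
+ * params.obj) are assumed from the experimental verbs API:
+ *
+ *	struct ibv_exp_query_intf_params params = {0};
+ *	enum ibv_exp_query_intf_status status;
+ *	struct ibv_exp_qp_burst_family *fam;
+ *
+ *	params.intf = IBV_EXP_INTF_QP_BURST;
+ *	params.obj = qp;
+ *	fam = ibv_exp_query_intf(ctx, &params, &status);
+ */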
Index: contrib/ofed/libmlx4/src/srq.c
===================================================================
--- contrib/ofed/libmlx4/src/srq.c
+++ contrib/ofed/libmlx4/src/srq.c
@@ -42,6 +42,7 @@
#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
+#include "mlx4-abi.h"
static void *get_wqe(struct mlx4_srq *srq, int n)
{
@@ -52,38 +53,43 @@
{
struct mlx4_wqe_srq_next_seg *next;
- pthread_spin_lock(&srq->lock);
+ mlx4_spin_lock(&srq->lock);
next = get_wqe(srq, srq->tail);
next->next_wqe_index = htons(ind);
srq->tail = ind;
- pthread_spin_unlock(&srq->lock);
+ mlx4_spin_unlock(&srq->lock);
}
int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
{
- struct mlx4_srq *srq = to_msrq(ibsrq);
+ struct mlx4_srq *srq;
struct mlx4_wqe_srq_next_seg *next;
struct mlx4_wqe_data_seg *scat;
int err = 0;
int nreq;
int i;
- pthread_spin_lock(&srq->lock);
+ if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE)
+ ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq);
+ srq = to_msrq(ibsrq);
+ mlx4_spin_lock(&srq->lock);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (wr->num_sge > srq->max_gs) {
- err = -1;
+ errno = EINVAL;
+ err = errno;
*bad_wr = wr;
break;
}
if (srq->head == srq->tail) {
/* SRQ is full*/
- err = -1;
+ errno = ENOMEM;
+ err = errno;
*bad_wr = wr;
break;
}
@@ -119,7 +125,7 @@
*srq->db = htonl(srq->counter);
}
- pthread_spin_unlock(&srq->lock);
+ mlx4_spin_unlock(&srq->lock);
return err;
}
@@ -174,52 +180,153 @@
return 0;
}
-struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+ memset(xsrq_table, 0, sizeof *xsrq_table);
+ xsrq_table->num_xsrq = size;
+ xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS;
+ xsrq_table->mask = (1 << xsrq_table->shift) - 1;
- if (ctx->xrc_srq_table[tind].refcnt)
- return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask];
- else
- return NULL;
+ pthread_mutex_init(&xsrq_table->mutex, NULL);
}
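+
+/*
+ * The xsrq table is a two-level, lazily allocated lookup: the SRQ number
+ * selects one of 1 << MLX4_XSRQ_TABLE_BITS buckets (upper bits) and an
+ * offset within the bucket (the low 'shift' bits); buckets are allocated on
+ * first use and reference-counted by mlx4_store_xsrq()/mlx4_clear_xsrq().
+ */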
-int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
- struct mlx4_srq *srq)
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
- int ret = 0;
+ int index;
- pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ if (xsrq_table->xsrq_table[index].refcnt)
+ return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask];
+
+ return NULL;
+}
- if (!ctx->xrc_srq_table[tind].refcnt) {
- ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1,
- sizeof(struct mlx4_srq *));
- if (!ctx->xrc_srq_table[tind].table) {
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq)
+{
+ int index, ret = 0;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+ if (!xsrq_table->xsrq_table[index].refcnt) {
+ xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1,
+ sizeof(struct mlx4_srq *));
+ if (!xsrq_table->xsrq_table[index].table) {
ret = -1;
goto out;
}
}
- ++ctx->xrc_srq_table[tind].refcnt;
- ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq;
+ xsrq_table->xsrq_table[index].refcnt++;
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq;
out:
- pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+ pthread_mutex_unlock(&xsrq_table->mutex);
return ret;
}
-void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+ int index;
- pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
- if (!--ctx->xrc_srq_table[tind].refcnt)
- free(ctx->xrc_srq_table[tind].table);
+ if (--xsrq_table->xsrq_table[index].refcnt)
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL;
else
- ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL;
+ free(xsrq_table->xsrq_table[index].table);
+
+ pthread_mutex_unlock(&xsrq_table->mutex);
+}
+
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ struct mlx4_create_xsrq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
- pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+ /* Sanity check SRQ size before proceeding */
+ if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
+ return NULL;
+
+ srq = calloc(1, sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded))
+ goto err;
+
+ srq->max = align_queue_size(attr_ex->attr.max_wr + 1);
+ srq->max_gs = attr_ex->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 1;
+
+ if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, sizeof(srq->verbs_srq),
+ attr_ex,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
+ srq->verbs_srq.srq_num, srq);
+ if (ret)
+ goto err_destroy;
+
+ return &srq->verbs_srq.srq;
+
+err_destroy:
+ ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+err:
+ free(srq);
+ return NULL;
}
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
+{
+ struct mlx4_context *mctx = to_mctx(srq->context);
+ struct mlx4_srq *msrq = to_msrq(srq);
+ struct mlx4_cq *mcq;
+ int ret;
+
+ mcq = to_mcq(msrq->verbs_srq.cq);
+ mlx4_cq_clean(mcq, 0, msrq);
+ mlx4_lock(&mcq->lock);
+ mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
+ mlx4_unlock(&mcq->lock);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret) {
+ mlx4_lock(&mcq->lock);
+ mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
+ mlx4_unlock(&mcq->lock);
+ return ret;
+ }
+
+ mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
+ mlx4_free_buf(&msrq->buf);
+ free(msrq->wrid);
+ free(msrq);
+
+ return 0;
+}
Index: contrib/ofed/libmlx4/src/verbs.c
===================================================================
--- contrib/ofed/libmlx4/src/verbs.c
+++ contrib/ofed/libmlx4/src/verbs.c
@@ -40,38 +40,130 @@
#include <pthread.h>
#include <errno.h>
#include <netinet/in.h>
-
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+/* Added for reg_mr mmap munmap system calls */
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sched.h>
+#include <glob.h>
#include "mlx4.h"
#include "mlx4-abi.h"
+#include "mlx4_exp.h"
#include "wqe.h"
+#define SHARED_MR_PROC_DIR_NAME "/proc/driver/mlx4_ib/mrs"
+#define FPATH_MAX 128
+
+int __mlx4_query_device(uint64_t raw_fw_ver,
+ struct ibv_device_attr *attr)
+{
+ unsigned major, minor, sub_minor;
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
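+ /* e.g. raw_fw_ver 0x00000002000b03e8 decodes as fw_ver "2.11.1000" */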
+
+ snprintf(attr->fw_ver, sizeof attr->fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
{
struct ibv_query_device cmd;
uint64_t raw_fw_ver;
- unsigned major, minor, sub_minor;
int ret;
- ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
+ read_init_vars(to_mctx(context));
+ ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd,
+ sizeof(cmd));
if (ret)
return ret;
- major = (raw_fw_ver >> 32) & 0xffff;
- minor = (raw_fw_ver >> 16) & 0xffff;
- sub_minor = raw_fw_ver & 0xffff;
+ return __mlx4_query_device(raw_fw_ver, attr);
+}
- snprintf(attr->fw_ver, sizeof attr->fw_ver,
- "%d.%d.%03d", major, minor, sub_minor);
+#define READL(ptr) (*((uint32_t *)(ptr)))
+
+static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
+{
+ unsigned int clockhi, clocklo, clockhi1;
+ int i;
+ struct mlx4_context *ctx = to_mctx(context);
+
+ if (ctx->hca_core_clock == NULL)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < 10; i++) {
+ clockhi = ntohl(READL(ctx->hca_core_clock));
+ clocklo = ntohl(READL(ctx->hca_core_clock + 4));
+ clockhi1 = ntohl(READL(ctx->hca_core_clock));
+ if (clockhi == clockhi1)
+ break;
+ }
+
+ if (clocklo == 0)
+ clockhi++;
+
+ *cycles = (uint64_t) clockhi << 32 | (uint64_t) clocklo;
return 0;
}
+int mlx4_query_values(struct ibv_context *context, int q_values,
+ struct ibv_exp_values *values)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ uint64_t cycles;
+ int err;
+ uint32_t comp_mask = values->comp_mask;
+
+ values->comp_mask = 0;
+
+ if (q_values & (IBV_EXP_VALUES_HW_CLOCK | IBV_EXP_VALUES_HW_CLOCK_NS)) {
+ err = mlx4_read_clock(context, &cycles);
+ if (!err) {
+ if (comp_mask & IBV_EXP_VALUES_HW_CLOCK) {
+ values->hwclock = cycles;
+ values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK;
+ }
+ if (q_values & IBV_EXP_VALUES_HW_CLOCK_NS) {
+ if (comp_mask & IBV_EXP_VALUES_HW_CLOCK_NS) {
+ values->hwclock_ns =
+ ((uint64_t)values->hwclock *
+ ctx->core_clk.mult)
+ >> ctx->core_clk.shift;
+ values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK_NS;
+ }
+ }
+ }
+ }
+ return 0;
+}
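
/*
 * Illustration only (editor's sketch, not part of the patch): the
 * cycles-to-nanoseconds conversion used in mlx4_query_values() above is a
 * fixed-point multiply with the mult/shift pair kept in mlx4_context
 * (core_clk.mult and core_clk.shift in this diff); the integer types used
 * here are assumptions of the sketch.
 */
static inline uint64_t cycles_to_ns_example(uint64_t cycles,
					    uint32_t mult, uint32_t shift)
{
	/* ns = cycles * mult / 2^shift */
	return (cycles * mult) >> shift;
}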
int mlx4_query_port(struct ibv_context *context, uint8_t port,
struct ibv_port_attr *attr)
{
struct ibv_query_port cmd;
+ int err;
+
+ read_init_vars(to_mctx(context));
+ err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
+ if (!err && port <= MLX4_PORTS_NUM && port > 0) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (!mctx->port_query_cache[port - 1].valid) {
+ mctx->port_query_cache[port - 1].link_layer =
+ attr->link_layer;
+ mctx->port_query_cache[port - 1].caps =
+ attr->port_cap_flags;
+ mctx->port_query_cache[port - 1].valid = 1;
+ }
+ }
- return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
+ return err;
}
struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
@@ -80,6 +172,7 @@
struct mlx4_alloc_pd_resp resp;
struct mlx4_pd *pd;
+ read_init_vars(to_mctx(context));
pd = malloc(sizeof *pd);
if (!pd)
return NULL;
@@ -107,50 +200,570 @@
return 0;
}
-struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
- enum ibv_access_flags access)
+
+static void mlx4_free_mr(struct mlx4_mr *mlx4_mr)
+{
+	/* The MR address was allocated in a special mode - free it accordingly */
+ if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR ||
+ mlx4_mr->shared_mr)
+ mlx4_free_buf(&(mlx4_mr->buf));
+
+ /* Finally we free the structure itself */
+ free(mlx4_mr);
+}
+
+
+static void *mlx4_get_contiguous_alloc_fallback(struct mlx4_buf *buf,
+ struct ibv_pd *pd, size_t length)
+{
+
+	/* As a fallback, allocate non-contiguous pages */
+ if (mlx4_alloc_buf(
+ buf,
+ align(length, to_mdev(pd->context->device)->page_size),
+ to_mdev(pd->context->device)->page_size))
+ return NULL;
+
+ return buf->buf;
+}
+
+
+/* We'll call mmap on mlx4_ib module to achieve this task */
+static void *mlx4_get_contiguous_alloc(struct mlx4_buf *mlx4_buf,
+ struct ibv_pd *pd,
+ size_t length,
+ void *contig_addr)
+{
+ size_t alloc_length;
+ int page_size;
+ int mr_no_allocator = 0;
+ int mr_force_contig_pages = 0;
+ enum mlx4_alloc_type alloc_type;
+
+ mlx4_get_alloc_type(pd->context, MLX4_MR_PREFIX, &alloc_type,
+ MLX4_ALLOC_TYPE_ALL);
+
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG)
+ mr_force_contig_pages = 1;
+ else if (alloc_type == MLX4_ALLOC_TYPE_ANON)
+ mr_no_allocator = 1;
+
+	/* For benchmarking purposes we provide an option, controlled by an
+	   environment variable, to turn off the contiguous allocator.
+ */
+ if (mr_no_allocator)
+ return mlx4_get_contiguous_alloc_fallback(mlx4_buf, pd,
+ length);
+
+ page_size = to_mdev(pd->context->device)->page_size;
+ alloc_length = (contig_addr ? length : align(length, page_size));
+ if (!(mlx4_alloc_buf_contig(to_mctx(pd->context),
+ mlx4_buf, alloc_length,
+ page_size, MLX4_MR_PREFIX, contig_addr)))
+ return contig_addr ? contig_addr : mlx4_buf->buf;
+
+ if (mr_force_contig_pages || contig_addr)
+ return NULL;
+
+ return mlx4_get_contiguous_alloc_fallback(mlx4_buf,
+ pd, length);
+
+}
+
+static int mlx4_get_shared_mr_name(char *in_pattern, char *file_name)
+{
+ glob_t results;
+ int ret;
+
+ ret = glob(in_pattern, 0, NULL, &results);
+
+ if (ret) {
+ if (mlx4_trace)
+ /* might be some legacy kernel with old mode */
+ fprintf(stderr, "mlx4_get_shared_mr_name: glob failed for %s, ret=%d, errno=%d\n",
+ in_pattern, ret, errno);
+ return ret;
+ }
+
+ if (results.gl_pathc > 1) {
+ int i;
+ int duplicate_name = 1;
+
+	/* We encountered an issue where glob returned the same name twice; we suspect it to be
+	 * an issue with glob/procfs. When there is more than one entry, check whether all entries
+	 * are the same; in that case the API succeeded and we use the first entry's name.
+ */
+ for (i = 1; i < results.gl_pathc; i++) {
+ if (strcmp(results.gl_pathv[0], results.gl_pathv[i])) {
+ duplicate_name = 0;
+ break;
+ }
+ }
+
+ if (!duplicate_name) {
+ fprintf(stderr, "mlx4_get_shared_mr_name failed for %s, unexpected %lu paths were found\n",
+ in_pattern, (unsigned long)(results.gl_pathc));
+ for (i = 0; i < results.gl_pathc; i++)
+ fprintf(stderr, "mlx4_get_shared_mr_name: path#%d=%s\n", i,
+ results.gl_pathv[i]);
+ globfree(&results);
+ return -EINVAL;
+ }
+ }
+
+ strncpy(file_name, results.gl_pathv[0], FPATH_MAX);
+ file_name[FPATH_MAX - 1] = '\0';
+ globfree(&results);
+ return 0;
+}
+
+struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in)
+{
+ struct ibv_context *context;
+ size_t total_size;
+ int page_size;
+ char shared_mr_file_name[FPATH_MAX];
+ char shared_mr_pattern[FPATH_MAX];
+ int fd;
+ struct stat buffer;
+ int status;
+ struct ibv_mr *ibv_mr;
+ uint64_t shared_flags;
+ struct mlx4_mr *mlx4_mr = NULL;
+ void *addr = in->addr;
+ uint64_t access = in->exp_access;
+ struct ibv_exp_reg_mr_in rmr_in;
+ int flags;
+ int ret;
+ int is_writeable_mr = !!(access & (IBV_EXP_ACCESS_REMOTE_WRITE |
+ IBV_EXP_ACCESS_LOCAL_WRITE | IBV_EXP_ACCESS_REMOTE_ATOMIC));
+
+ context = in->pd->context;
+ page_size = to_mdev(context->device)->page_size;
+ sprintf(shared_mr_pattern, "%s/%X.*",
+ SHARED_MR_PROC_DIR_NAME, in->mr_handle);
+
+ ret = mlx4_get_shared_mr_name(shared_mr_pattern, shared_mr_file_name);
+ if (ret)
+		/* For compatibility, retry with the legacy name */
+ sprintf(shared_mr_file_name, "%s/%X",
+ SHARED_MR_PROC_DIR_NAME, in->mr_handle);
+
+ flags = is_writeable_mr ? O_RDWR : O_RDONLY;
+ fd = open(shared_mr_file_name, flags);
+ if (fd < 0) {
+ int counter = 10;
+ /* retrying for 1 second before reporting an error */
+ while (fd < 0 && counter > 0) {
+ usleep(100000);
+ counter--;
+ fd = open(shared_mr_file_name, flags);
+ }
+
+ if (fd < 0) {
+ fprintf(stderr, "mlx4_reg_shared_mr failed open %s errno=%d\n",
+ shared_mr_file_name, errno);
+ return NULL;
+ }
+ }
+
+ status = fstat(fd, &buffer);
+ if (status) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr lstat has failed , errno=%d\n",
+ errno);
+ goto error;
+ }
+
+ total_size = align(buffer.st_size, page_size);
+
+	/* Set protection based on access flags; the input address may be NULL
+	   or an address recommended by the application.
+ */
+ addr = mmap(addr , total_size,
+ is_writeable_mr ? (PROT_WRITE | PROT_READ) :
+ PROT_READ, MAP_SHARED,
+ fd,
+ 0);
+
+	/* On failure, MAP_FAILED (that is, (void *) -1) is returned */
+ if (addr == MAP_FAILED) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr mmap has failed , errno=%d\n",
+ errno);
+ goto error;
+ }
+
+ if (ibv_dontfork_range(addr, total_size)) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr dontfork has failed , errno=%d\n",
+ errno);
+ goto err_unmap;
+ }
+
+ if (access & IBV_EXP_ACCESS_NO_RDMA) {
+ mlx4_mr = calloc(1, sizeof *mlx4_mr);
+ if (!mlx4_mr)
+ goto err_dofork;
+
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_NO_RDMA;
+ ibv_mr = &(mlx4_mr->ibv_mr);
+ ibv_mr->context = in->pd->context;
+
+ } else {
+		/* Make sure that shared access flags are off before
+		   calling reg_mr, otherwise the new MR will be shared as well.
+ */
+ shared_flags = IBV_EXP_ACCESS_SHARED_MR_USER_READ |
+ IBV_EXP_ACCESS_SHARED_MR_USER_WRITE |
+ IBV_EXP_ACCESS_SHARED_MR_GROUP_READ |
+ IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE |
+ IBV_EXP_ACCESS_SHARED_MR_OTHER_READ |
+ IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE;
+
+ access &= ~shared_flags;
+ rmr_in.pd = in->pd;
+ rmr_in.addr = addr;
+ rmr_in.length = total_size;
+ rmr_in.exp_access = access;
+ rmr_in.comp_mask = 0;
+
+ ibv_mr = mlx4_exp_reg_mr(&rmr_in);
+ if (!ibv_mr)
+ goto err_dofork;
+ }
+
+	/* The file can be closed now - it is no longer required */
+ close(fd);
+
+ ibv_mr->length = total_size;
+ ibv_mr->addr = addr;
+ mlx4_mr = to_mmr(ibv_mr);
+	/* We mark this MR as a shared one so it is handled correctly via dereg_mr */
+ mlx4_mr->shared_mr = 1;
+	/* We also hook addr & length internally for further
+	   use via dereg_mr.
+ */
+ mlx4_mr->buf.buf = addr;
+ mlx4_mr->buf.length = total_size;
+ return ibv_mr;
+
+err_dofork:
+ ibv_dofork_range(addr, total_size);
+err_unmap:
+ munmap(addr, total_size);
+error:
+ close(fd);
+ return NULL;
+}
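
/*
 * Illustration only (editor's sketch, not part of the patch): a caller
 * attaching to an existing shared MR would fill struct
 * ibv_exp_reg_shared_mr_in with the fields consumed above.  The mr_handle is
 * assumed to be obtained out of band from the process that created the
 * shared MR, and the access flag shown is just an example.
 */
static struct ibv_mr *attach_shared_mr_example(struct ibv_pd *pd,
					       uint32_t mr_handle)
{
	struct ibv_exp_reg_shared_mr_in in;

	memset(&in, 0, sizeof(in));
	in.pd = pd;
	in.addr = NULL;			/* let mmap() pick the mapping address */
	in.mr_handle = mr_handle;
	in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE;

	return mlx4_reg_shared_mr(&in);
}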
+
+int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out)
+{
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+
+ out->need_dofork = (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR ||
+ mlx4_mr->shared_mr) ? 0 : 1;
+
+ return mlx4_dereg_mr(mr);
+}
+
+int mlx4_exp_rereg_mr(struct ibv_mr *mr,
+ int flags,
+ struct ibv_pd *pd, void *addr,
+ size_t length, uint64_t access,
+ struct ibv_exp_rereg_mr_attr *attr,
+ struct ibv_exp_rereg_out *out)
+{
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+ struct mlx4_buf buf;
+ struct ibv_exp_rereg_mr cmd;
+ struct ibv_exp_rereg_mr_resp resp;
+ int internal_alloc = 0;
+ int ret;
+
+ if (flags & (~IBV_EXP_REREG_MR_FLAGS_SUPPORTED | IBV_EXP_REREG_MR_KEEP_VALID))
+ return -EINVAL;
+
+ /* Currently, we don't support any features in comp_mask */
+ if (attr->comp_mask)
+ return -EINVAL;
+
+	/* Here we check whether contiguous pages are required and
+ should be allocated internally.
+ */
+
+ memset(&buf, 0, sizeof(buf));
+ if ((flags & IBV_EXP_REREG_MR_CHANGE_ACCESS) &&
+ !addr && (access & IBV_EXP_ACCESS_ALLOCATE_MR)) {
+ struct ibv_pd *curr_pd = flags & IBV_EXP_REREG_MR_CHANGE_PD ? pd : mr->pd;
+ addr = mlx4_get_contiguous_alloc(&buf, curr_pd, length, NULL);
+ if (!addr)
+ return -ENOMEM;
+
+ internal_alloc = 1;
+ }
+
+ ret = ibv_exp_cmd_rereg_mr(mr, flags, addr, length,
+ (uintptr_t) addr,
+ access, pd, attr,
+ &cmd, sizeof(cmd), 0,
+ &resp, sizeof(resp), 0);
+
+ if (ret) {
+ if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)
+ mlx4_free_buf(&buf);
+ return ret;
+ } else {
+ if (((mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR) ||
+ mlx4_mr->shared_mr) &&
+ (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)) {
+ mlx4_mr->shared_mr = 0;
+ mlx4_free_buf(&(mlx4_mr->buf));
+ /* The memory was just freed, mark it as NULL */
+ mlx4_mr->ibv_mr.addr = NULL;
+ mlx4_mr->allocation_flags &= ~IBV_EXP_ACCESS_ALLOCATE_MR;
+ out->need_dofork = 0;
+ }
+ if (internal_alloc) {
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR;
+			/* The address is returned to libibverbs through a
+			 * pointer-to-pointer mechanism.
+ */
+ mlx4_mr->ibv_mr.addr = addr;
+ mlx4_mr->ibv_mr.length = length;
+ memcpy(&mlx4_mr->buf, &buf, sizeof(mlx4_mr->buf));
+ }
+ }
+
+ return ret;
+}
+
+
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr)
+{
+ struct ibv_open_xrcd cmd;
+ struct ibv_open_xrcd_resp resp;
+ struct verbs_xrcd *xrcd;
+ int ret;
+
+ xrcd = calloc(1, sizeof *xrcd);
+ if (!xrcd)
+ return NULL;
+
+ ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &xrcd->xrcd;
+
+err:
+ free(xrcd);
+ return NULL;
+}
+
+int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
{
- struct ibv_mr *mr;
+ struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
+ int ret;
+
+ ret = ibv_cmd_close_xrcd(xrcd);
+ if (!ret)
+ free(xrcd);
+
+ return ret;
+}
+
+struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in)
+{
+
+ struct mlx4_mr *mlx4_mr;
struct ibv_reg_mr cmd;
int ret;
+ int cmd_access;
+ int is_contig;
+
+ if ((in->comp_mask > IBV_EXP_REG_MR_RESERVED - 1) ||
+ (in->exp_access > IBV_EXP_ACCESS_RESERVED - 1)) {
+ errno = EINVAL;
+ return NULL;
+ }
- mr = malloc(sizeof *mr);
- if (!mr)
+ mlx4_mr = calloc(1, sizeof *mlx4_mr);
+ if (!mlx4_mr)
return NULL;
+ VALGRIND_MAKE_MEM_DEFINED(&in->create_flags, sizeof(in->create_flags));
+ is_contig = ((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) && !in->addr) ||
+ ((in->comp_mask & IBV_EXP_REG_MR_CREATE_FLAGS) &&
+ (in->create_flags & IBV_EXP_REG_MR_CREATE_CONTIG));
+	/* Here we check whether contiguous pages are required and
+ should be allocated internally.
+ */
+ if (is_contig) {
+ in->addr = mlx4_get_contiguous_alloc(&mlx4_mr->buf, in->pd,
+ in->length, in->addr);
+ if (!in->addr) {
+ free(mlx4_mr);
+ return NULL;
+ }
+
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR;
+		/* Hook the addr onto the returned pointer for
+		   further use by the application.
+ */
+ mlx4_mr->ibv_mr.addr = in->addr;
+ }
+
+ cmd_access = (in->exp_access & (IBV_EXP_START_FLAG - 1)) |
+ (in->exp_access & (IBV_EXP_ACCESS_RESERVED - 1)) >> IBV_EXP_START_FLAG_LOC;
#ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS
{
struct ibv_reg_mr_resp resp;
- ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
- access, mr, &cmd, sizeof cmd,
- &resp, sizeof resp);
+ ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length,
+ (uintptr_t) in->addr, cmd_access,
+ &(mlx4_mr->ibv_mr),
+ &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
}
#else
- ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr,
- &cmd, sizeof cmd);
+ ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length,
+ (uintptr_t) in->addr, cmd_access,
+ &(mlx4_mr->ibv_mr),
+ &cmd, sizeof(cmd));
#endif
if (ret) {
- free(mr);
+ mlx4_free_mr(mlx4_mr);
return NULL;
}
- return mr;
+ return &(mlx4_mr->ibv_mr);
+}
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
+ size_t length, int access)
+{
+ struct ibv_exp_reg_mr_in in;
+
+ in.pd = pd;
+ in.addr = addr;
+ in.length = length;
+ in.exp_access = access;
+ in.comp_mask = 0;
+
+ return mlx4_exp_reg_mr(&in);
}
int mlx4_dereg_mr(struct ibv_mr *mr)
{
int ret;
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+
+ if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_NO_RDMA)
+ goto free_mr;
ret = ibv_cmd_dereg_mr(mr);
if (ret)
return ret;
+free_mr:
+ mlx4_free_mr(mlx4_mr);
+ return 0;
+}
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
+{
+ struct verbs_mw *vmw;
+ struct ibv_alloc_mw cmd;
+ struct ibv_alloc_mw_resp resp;
+ int ret;
+
+ vmw = malloc(sizeof(*vmw));
+ if (!vmw)
+ return NULL;
+ memset(vmw, 0, sizeof(*vmw));
+
+ ret = ibv_cmd_alloc_mw(pd, type, vmw, &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+
+ if (ret) {
+ free(vmw);
+ return NULL;
+ }
+ vmw->type = type;
+
+ return &vmw->mw;
+}
+
+int mlx4_dealloc_mw(struct ibv_mw *mw)
+{
+ int ret;
+ struct ibv_dealloc_mw cmd;
+ struct verbs_mw *vmw = (struct verbs_mw *)mw;
+
+ ret = ibv_cmd_dealloc_mw(vmw, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ free(vmw);
+ return 0;
+}
+
+int __mlx4_bind_mw(struct ibv_exp_mw_bind *mw_bind)
+{
+ int ret;
+ struct ibv_exp_send_wr *bad_wr = NULL;
+ struct ibv_exp_send_wr wr = { };
+
+ wr.exp_opcode = IBV_EXP_WR_BIND_MW;
+ wr.next = NULL;
+
+ wr.wr_id = mw_bind->wr_id;
+ wr.exp_send_flags = mw_bind->exp_send_flags;
+
+ wr.bind_mw.mw = mw_bind->mw;
+ wr.bind_mw.rkey = ibv_inc_rkey(mw_bind->mw->rkey);
+ wr.bind_mw.bind_info = mw_bind->bind_info;
+
+ ret = mlx4_exp_post_send(mw_bind->qp, &wr, &bad_wr);
+
+ if (ret)
+ return ret;
+
+ /* updating the mw with the latest rkey. */
+ mw_bind->mw->rkey = wr.bind_mw.rkey;
- free(mr);
return 0;
}
-static int align_queue_size(int req)
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind)
+{
+ struct ibv_exp_mw_bind exp_mw_bind;
+
+ memset(&exp_mw_bind, 0, sizeof(exp_mw_bind));
+ exp_mw_bind.qp = qp;
+ exp_mw_bind.exp_send_flags = mw_bind->send_flags;
+ exp_mw_bind.wr_id = mw_bind->wr_id;
+ exp_mw_bind.bind_info.addr = (uint64_t)(uintptr_t)mw_bind->addr;
+ exp_mw_bind.bind_info.length = mw_bind->length;
+ exp_mw_bind.bind_info.mr = mw_bind->mr;
+ exp_mw_bind.bind_info.exp_mw_access_flags = mw_bind->mw_access_flags;
+ exp_mw_bind.comp_mask = 0;
+
+ return __mlx4_bind_mw(&exp_mw_bind);
+
+}
+
+int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind)
+{
+ if (mw_bind->comp_mask > IBV_EXP_BIND_MW_RESERVED - 1)
+ return EINVAL;
+ return __mlx4_bind_mw(mw_bind);
+}
+
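/*
 * Illustration only (editor's sketch, not part of the patch): the bind
 * helpers above derive the new rkey with ibv_inc_rkey() before posting the
 * IBV_EXP_WR_BIND_MW work request and write it back to mw->rkey only when
 * the post succeeds, so a failed bind leaves the previous rkey in place.
 */
static int rebind_mw_example(struct ibv_qp *qp, struct ibv_mw *mw,
			     struct ibv_mw_bind *bind)
{
	int ret = mlx4_bind_mw(qp, mw, bind);

	if (ret)
		return ret;	/* mw->rkey is left unchanged on failure */

	/* on success mw->rkey now holds the incremented key to hand to peers */
	return 0;
}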
+int align_queue_size(int req)
{
int nent;
@@ -160,36 +773,52 @@
return nent;
}
-struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
- struct ibv_comp_channel *channel,
- int comp_vector)
+static struct ibv_cq *create_cq(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr)
{
- struct mlx4_create_cq cmd;
- struct mlx4_create_cq_resp resp;
- struct mlx4_cq *cq;
- int ret;
- struct mlx4_context *mctx = to_mctx(context);
+ struct mlx4_create_cq cmd;
+ struct mlx4_exp_create_cq cmd_e;
+ struct mlx4_create_cq_resp resp;
+ struct mlx4_cq *cq;
+ int ret;
+ struct mlx4_context *mctx = to_mctx(context);
+ int thread_safe;
/* Sanity check CQ size before proceeding */
if (cqe > 0x3fffff)
return NULL;
- cq = malloc(sizeof *cq);
+ cq = calloc(1, sizeof(*cq));
if (!cq)
return NULL;
cq->cons_index = 0;
+ cq->wait_index = 0;
+ cq->wait_count = 0;
- if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
+ thread_safe = !mlx4_single_threaded;
+ if (attr && (attr->comp_mask & IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN)) {
+ if (!attr->res_domain) {
+ errno = EINVAL;
+ goto err;
+ }
+ thread_safe = (to_mres_domain(attr->res_domain)->attr.thread_model == IBV_EXP_THREAD_SAFE);
+ }
+
+ if (mlx4_lock_init(&cq->lock, thread_safe, mlx4_get_locktype()))
goto err;
+ cq->model_flags = thread_safe ? MLX4_CQ_MODEL_FLAG_THREAD_SAFE : 0;
+
cqe = align_queue_size(cqe + 1);
- if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size))
+ if (mlx4_alloc_cq_buf(to_mctx(context), &cq->buf, cqe, mctx->cqe_size))
goto err;
cq->cqe_size = mctx->cqe_size;
-
cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
if (!cq->set_ci_db)
goto err_buf;
@@ -199,16 +828,41 @@
cq->arm_sn = 1;
*cq->set_ci_db = 0;
- cmd.buf_addr = (uintptr_t) cq->buf.buf;
- cmd.db_addr = (uintptr_t) cq->set_ci_db;
-
- ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector,
- &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
+ if (NULL != attr) {
+ cmd_e.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd_e.db_addr = (uintptr_t) cq->set_ci_db;
+ } else {
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+ }
+ if (NULL != attr) {
+ ret = ibv_exp_cmd_create_cq(context, cqe - 1, channel,
+ comp_vector, &cq->ibv_cq,
+ &cmd_e.ibv_cmd,
+ sizeof(cmd_e.ibv_cmd),
+ sizeof(cmd_e) - sizeof(cmd_e.ibv_cmd),
+ &resp.ibv_resp,
+ sizeof(resp.ibv_resp),
+ sizeof(resp) - sizeof(resp.ibv_resp),
+ attr);
+ } else {
+ ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector,
+ &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof(resp));
+ }
if (ret)
goto err_db;
cq->cqn = resp.cqn;
+ cq->stall_next_poll = 0;
+ cq->stall_enable = mctx->stall_enable;
+ if (NULL != attr && attr->comp_mask) {
+ if (cmd_e.ibv_cmd.comp_mask & IBV_EXP_CREATE_CQ_CAP_FLAGS) {
+ cq->creation_flags = attr->flags;
+ }
+ }
+
+ cq->pattern = MLX4_CQ_PATTERN;
return &cq->ibv_cq;
@@ -216,14 +870,41 @@
mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
err_buf:
- mlx4_free_buf(&cq->buf);
-
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(context), &cq->buf);
+ else
+ mlx4_free_buf(&cq->buf);
err:
free(cq);
return NULL;
}
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector)
+{
+ read_init_vars(to_mctx(context));
+ return create_cq(context, cqe, channel, comp_vector, NULL);
+}
+
+struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr)
+{
+ return create_cq(context, cqe, channel, comp_vector, attr);
+}
+
+int mlx4_modify_cq(struct ibv_cq *cq,
+ struct ibv_exp_cq_attr *attr,
+ int attr_mask)
+{
+ struct ibv_exp_modify_cq cmd;
+ return ibv_exp_cmd_modify_cq(cq, attr, attr_mask, &cmd, sizeof(cmd));
+}
+
int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
{
struct mlx4_cq *cq = to_mcq(ibcq);
@@ -235,7 +916,7 @@
if (cqe > 0x3fffff)
return EINVAL;
- pthread_spin_lock(&cq->lock);
+ mlx4_lock(&cq->lock);
cqe = align_queue_size(cqe + 1);
if (cqe == ibcq->cqe + 1) {
@@ -250,7 +931,7 @@
goto out;
}
- ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe,
+ ret = mlx4_alloc_cq_buf(to_mctx(ibcq->context), &buf, cqe,
cq->cqe_size);
if (ret)
goto out;
@@ -268,17 +949,24 @@
ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
#endif
if (ret) {
- mlx4_free_buf(&buf);
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(ibcq->context), &buf);
+ else
+ mlx4_free_buf(&buf);
goto out;
}
mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
- mlx4_free_buf(&cq->buf);
- cq->buf = buf;
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(ibcq->context), &cq->buf);
+ else
+ mlx4_free_buf(&cq->buf);
+ cq->buf = buf;
+ mlx4_update_cons_index(cq);
out:
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
return ret;
}
@@ -291,14 +979,32 @@
return ret;
mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
- mlx4_free_buf(&to_mcq(cq)->buf);
+ if (to_mcq(cq)->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(cq->context), &to_mcq(cq)->buf);
+ else
+ mlx4_free_buf(&to_mcq(cq)->buf);
free(to_mcq(cq));
return 0;
}
+void *mlx4_get_legacy_xrc(struct ibv_srq *srq)
+{
+ struct mlx4_srq *msrq = to_msrq(srq);
+
+ return msrq->ibv_srq_legacy;
+}
+
+void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq)
+{
+ struct mlx4_srq *msrq = to_msrq(srq);
+
+ msrq->ibv_srq_legacy = legacy_xrc_srq;
+ return;
+}
+
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
- struct ibv_srq_init_attr *attr)
+ struct ibv_srq_init_attr *attr)
{
struct mlx4_create_srq cmd;
struct mlx4_create_srq_resp resp;
@@ -309,16 +1015,17 @@
if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
return NULL;
- srq = malloc(sizeof *srq);
+ srq = calloc(1, sizeof *srq);
if (!srq)
return NULL;
- if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded))
goto err;
srq->max = align_queue_size(attr->attr.max_wr + 1);
srq->max_gs = attr->attr.max_sge;
srq->counter = 0;
+ srq->ext_srq = 0;
if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
goto err;
@@ -332,15 +1039,13 @@
cmd.buf_addr = (uintptr_t) srq->buf.buf;
cmd.db_addr = (uintptr_t) srq->db;
- ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr,
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
&cmd.ibv_cmd, sizeof cmd,
&resp.ibv_resp, sizeof resp);
if (ret)
goto err_db;
- srq->srqn = resp.srqn;
-
- return &srq->ibv_srq;
+ return &srq->verbs_srq.srq;
err_db:
mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
@@ -355,12 +1060,27 @@
return NULL;
}
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
+ (attr_ex->srq_type == IBV_SRQT_BASIC))
+ return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+ else if (attr_ex->srq_type == IBV_SRQT_XRC)
+ return mlx4_create_xrc_srq(context, attr_ex);
+
+ return NULL;
+}
+
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
- enum ibv_srq_attr_mask attr_mask)
+ int attr_mask)
{
struct ibv_modify_srq cmd;
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
+
return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
}
@@ -369,199 +1089,98 @@
{
struct ibv_query_srq cmd;
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
+
return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
}
-int mlx4_destroy_srq(struct ibv_srq *ibsrq)
+int mlx4_destroy_srq(struct ibv_srq *srq)
{
- struct mlx4_srq *srq = to_msrq(ibsrq);
- struct mlx4_cq *mcq = NULL;
int ret;
+ struct ibv_srq *legacy_srq = NULL;
- if (ibsrq->xrc_cq) {
- /* is an xrc_srq */
- mcq = to_mcq(ibsrq->xrc_cq);
- mlx4_cq_clean(mcq, 0, srq);
- pthread_spin_lock(&mcq->lock);
- mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn);
- pthread_spin_unlock(&mcq->lock);
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE) {
+ legacy_srq = srq;
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
}
- ret = ibv_cmd_destroy_srq(ibsrq);
- if (ret) {
- if (ibsrq->xrc_cq) {
- pthread_spin_lock(&mcq->lock);
- mlx4_store_xrc_srq(to_mctx(ibsrq->context),
- srq->srqn, srq);
- pthread_spin_unlock(&mcq->lock);
- }
- return ret;
+ if (to_msrq(srq)->ext_srq) {
+ ret = mlx4_destroy_xrc_srq(srq);
+ if (ret)
+ return ret;
+
+ if (legacy_srq)
+ free(legacy_srq);
+
+ return 0;
}
- mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db);
- mlx4_free_buf(&srq->buf);
- free(srq->wrid);
- free(srq);
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
+ mlx4_free_buf(&to_msrq(srq)->buf);
+ free(to_msrq(srq)->wrid);
+ free(to_msrq(srq));
return 0;
}
-static int verify_sizes(struct ibv_qp_init_attr *attr, struct mlx4_context *context)
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr)
{
- int size;
- int nsegs;
-
- if (attr->cap.max_send_wr > context->max_qp_wr ||
- attr->cap.max_recv_wr > context->max_qp_wr ||
- attr->cap.max_send_sge > context->max_sge ||
- attr->cap.max_recv_sge > context->max_sge)
- return -1;
-
- if (attr->cap.max_inline_data) {
- nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type);
- size = MLX4_MAX_WQE_SIZE - nsegs * sizeof (struct mlx4_wqe_inline_seg);
- switch (attr->qp_type) {
- case IBV_QPT_UD:
- size -= (sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_datagram_seg));
- break;
-
- case IBV_QPT_RC:
- case IBV_QPT_UC:
- case IBV_QPT_XRC:
- size -= (sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_raddr_seg));
- break;
-
- default:
- return 0;
- }
-
- if (attr->cap.max_inline_data > size)
- return -1;
- }
-
- return 0;
+ read_init_vars(to_mctx(context));
+ return mlx4_exp_create_qp(context, (struct ibv_exp_qp_init_attr *)attr);
}
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
{
- struct mlx4_create_qp cmd;
- struct ibv_create_qp_resp resp;
- struct mlx4_qp *qp;
- int ret;
- struct mlx4_context *context = to_mctx(pd->context);
-
+ struct ibv_exp_qp_init_attr attr_exp;
+ struct ibv_qp *qp;
+	/* We should copy below only the shared fields, excluding the xrc_domain field.
+	 * Otherwise we may have an ABI issue with applications that were compiled
+	 * without the xrc_domain field. The xrc_domain anyway has no effect on
+	 * the sender side, so there is no need to copy it in/out.
+ */
+ int init_attr_base_size = offsetof(struct ibv_qp_init_attr, xrc_domain);
+
+ /* copying only shared fields */
+ memcpy(&attr_exp, attr, init_attr_base_size);
+ attr_exp.comp_mask = IBV_EXP_QP_INIT_ATTR_PD;
+ attr_exp.pd = pd;
+ qp = mlx4_exp_create_qp(pd->context, &attr_exp);
+ if (qp)
+ memcpy(attr, &attr_exp, init_attr_base_size);
+ return qp;
+}
- /* Sanity check QP size before proceeding */
- if (verify_sizes(attr, context))
- return NULL;
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
+{
+ struct ibv_open_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
- qp = malloc(sizeof *qp);
+ qp = calloc(1, sizeof *qp);
if (!qp)
return NULL;
- mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
-
- /*
- * We need to leave 2 KB + 1 WQE of headroom in the SQ to
- * allow HW to prefetch.
- */
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
- qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
- qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
-
- if (attr->srq || attr->qp_type == IBV_QPT_XRC)
- attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
- else {
- if (attr->cap.max_recv_sge < 1)
- attr->cap.max_recv_sge = 1;
- if (attr->cap.max_recv_wr < 1)
- attr->cap.max_recv_wr = 1;
- }
-
- if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp))
- goto err;
-
- mlx4_init_qp_indices(qp);
-
- if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
- pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
- goto err_free;
-
- if (!attr->srq && attr->qp_type != IBV_QPT_XRC) {
- qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
- if (!qp->db)
- goto err_free;
-
- *qp->db = 0;
- }
-
- cmd.buf_addr = (uintptr_t) qp->buf.buf;
- if (attr->srq || attr->qp_type == IBV_QPT_XRC)
- cmd.db_addr = 0;
- else
- cmd.db_addr = (uintptr_t) qp->db;
- cmd.log_sq_stride = qp->sq.wqe_shift;
- for (cmd.log_sq_bb_count = 0;
- qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
- ++cmd.log_sq_bb_count)
- ; /* nothing */
- cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
- memset(cmd.reserved, 0, sizeof cmd.reserved);
-
- pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex);
-
- ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd,
- &resp, sizeof resp);
- if (ret)
- goto err_rq_db;
-
- ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp);
+ ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
if (ret)
- goto err_destroy;
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
-
- qp->rq.wqe_cnt = attr->cap.max_recv_wr;
- qp->rq.max_gs = attr->cap.max_recv_sge;
-
- /* adjust rq maxima to not exceed reported device maxima */
- attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr);
- attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge);
-
- qp->rq.max_post = attr->cap.max_recv_wr;
- mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
-
- qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8);
- if (attr->sq_sig_all)
- qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
-
- return &qp->ibv_qp;
-
-err_destroy:
- ibv_cmd_destroy_qp(&qp->ibv_qp);
-
-err_rq_db:
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
- if (!attr->srq && attr->qp_type != IBV_QPT_XRC)
- mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db);
+ goto err;
-err_free:
- free(qp->sq.wrid);
- if (qp->rq.wqe_cnt)
- free(qp->rq.wrid);
- mlx4_free_buf(&qp->buf);
+ return &qp->verbs_qp.qp;
err:
free(qp);
-
return NULL;
}
int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask,
+ int attr_mask,
struct ibv_qp_init_attr *init_attr)
{
struct ibv_query_qp cmd;
@@ -582,11 +1201,17 @@
}
int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask)
+ int attr_mask)
{
struct ibv_modify_qp cmd;
int ret;
+ if (attr_mask & IBV_QP_PORT) {
+ ret = update_port_data(qp, attr->port_num);
+ if (ret)
+ return ret;
+ }
+
if (qp->state == IBV_QPS_RESET &&
attr_mask & IBV_QP_STATE &&
attr->qp_state == IBV_QPS_INIT) {
@@ -598,13 +1223,14 @@
if (!ret &&
(attr_mask & IBV_QP_STATE) &&
attr->qp_state == IBV_QPS_RESET) {
- mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
- qp->srq ? to_msrq(qp->srq) : NULL);
- if (qp->send_cq != qp->recv_cq)
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
mlx4_init_qp_indices(to_mqp(qp));
- if (!qp->srq && qp->qp_type != IBV_QPT_XRC)
+ if (to_mqp(qp)->rq.wqe_cnt)
*to_mqp(qp)->db = 0;
}
@@ -616,14 +1242,19 @@
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
- pthread_spin_lock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
- pthread_spin_lock(&send_cq->lock);
- pthread_spin_lock(&recv_cq->lock);
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ mlx4_lock(&send_cq->lock);
+ else if (qp->recv_cq)
+ mlx4_lock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ mlx4_lock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ mlx4_lock(&send_cq->lock);
+ mlx4_lock(&recv_cq->lock);
} else {
- pthread_spin_lock(&recv_cq->lock);
- pthread_spin_lock(&send_cq->lock);
+ mlx4_lock(&recv_cq->lock);
+ mlx4_lock(&send_cq->lock);
}
}
@@ -632,14 +1263,20 @@
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
- pthread_spin_unlock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
- pthread_spin_unlock(&recv_cq->lock);
- pthread_spin_unlock(&send_cq->lock);
+
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ mlx4_unlock(&send_cq->lock);
+ else if (qp->recv_cq)
+ mlx4_unlock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ mlx4_unlock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ mlx4_unlock(&recv_cq->lock);
+ mlx4_unlock(&send_cq->lock);
} else {
- pthread_spin_unlock(&send_cq->lock);
- pthread_spin_unlock(&recv_cq->lock);
+ mlx4_unlock(&send_cq->lock);
+ mlx4_unlock(&recv_cq->lock);
}
}
@@ -656,246 +1293,120 @@
}
mlx4_lock_cqs(ibqp);
-
- __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
- ibqp->srq ? to_msrq(ibqp->srq) : NULL);
- if (ibqp->send_cq != ibqp->recv_cq)
+ if (ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
- mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+ mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
mlx4_unlock_cqs(ibqp);
pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
- if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC)
- mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
- free(qp->sq.wrid);
+ /*
+ * Use the qp->bf to check if the QP is using dedicated BF.
+ * If so, update the dedicated BF database.
+ */
+ if (qp->bf && (&qp->bf->cmn != &(to_mctx(ibqp->context)->bfs.cmn_bf))) {
+ struct mlx4_bfs_data *bfs = &to_mctx(ibqp->context)->bfs;
+ int idx = &(qp->bf->dedic) - bfs->dedic_bf;
+
+ if (0 <= idx && idx < (MLX4_MAX_BFS_IN_PAGE - 1)) {
+ mlx4_spin_lock(&bfs->dedic_bf_lock);
+ bfs->dedic_bf_used[idx] = 0;
+ bfs->dedic_bf_free++;
+ mlx4_spin_unlock(&bfs->dedic_bf_lock);
+ }
+ }
+
if (qp->rq.wqe_cnt)
- free(qp->rq.wrid);
- mlx4_free_buf(&qp->buf);
+ mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
+
+ mlx4_dealloc_qp_buf(ibqp->context, qp);
+
free(qp);
return 0;
}
-struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd,
+ struct ibv_ah_attr *attr,
+ uint8_t link_layer)
{
struct mlx4_ah *ah;
- struct ibv_port_attr port_attr;
- uint8_t is_mcast;
+
+ if (unlikely(!attr->dlid) &&
+ (link_layer != IBV_LINK_LAYER_ETHERNET)) {
+ errno = EINVAL;
+ return NULL;
+ }
ah = malloc(sizeof *ah);
if (!ah)
return NULL;
- memset(ah, 0, sizeof *ah);
+ memset(&ah->av, 0, sizeof ah->av);
ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24));
- ah->av.g_slid = attr->src_path_bits;
- ah->av.dlid = htons(attr->dlid);
+
+ if (link_layer != IBV_LINK_LAYER_ETHERNET) {
+ ah->av.g_slid = attr->src_path_bits;
+ ah->av.dlid = htons(attr->dlid);
+ ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28);
+ } else {
+ ah->vlan = ((attr->sl & 7) << 13);
+ ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29);
+ }
+
if (attr->static_rate) {
ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
/* XXX check rate cap? */
}
- ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28);
if (attr->is_global) {
ah->av.g_slid |= 0x80;
ah->av.gid_index = attr->grh.sgid_index;
- ah->av.hop_limit = attr->grh.hop_limit;
+ if (attr->grh.hop_limit < 2)
+ ah->av.hop_limit = 0xff;
+ else
+ ah->av.hop_limit = attr->grh.hop_limit;
ah->av.sl_tclass_flowlabel |=
htonl((attr->grh.traffic_class << 20) |
attr->grh.flow_label);
memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
}
- if (ibv_query_port(pd->context, attr->port_num, &port_attr))
- goto err;
-
- if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- if (ibv_resolve_eth_gid(pd, attr->port_num,
- (union ibv_gid *)ah->av.dgid,
- attr->grh.sgid_index,
- ah->mac, &ah->vlan,
- &ah->tagged, &is_mcast))
- goto err;
-
- if (is_mcast) {
- ah->av.dlid = htons(0xc000);
- ah->av.port_pd |= htonl(1 << 31);
- }
- if (ah->tagged) {
- ah->av.port_pd |= htonl(1 << 29);
- ah->vlan |= (attr->sl & 7) << 13;
- }
- }
-
-
return &ah->ibv_ah;
-err:
- free(ah);
- return NULL;
}
-int mlx4_destroy_ah(struct ibv_ah *ah)
-{
- free(to_mah(ah));
-
- return 0;
-}
-
-#ifdef HAVE_IBV_XRC_OPS
-struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
- struct ibv_xrc_domain *xrc_domain,
- struct ibv_cq *xrc_cq,
- struct ibv_srq_init_attr *attr)
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
- struct mlx4_create_xrc_srq cmd;
- struct mlx4_create_srq_resp resp;
- struct mlx4_srq *srq;
- int ret;
-
- /* Sanity check SRQ size before proceeding */
- if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
- return NULL;
-
- srq = malloc(sizeof *srq);
- if (!srq)
- return NULL;
-
- if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
- goto err;
-
- srq->max = align_queue_size(attr->attr.max_wr + 1);
- srq->max_gs = attr->attr.max_sge;
- srq->counter = 0;
-
- if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
- goto err;
-
- srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
- if (!srq->db)
- goto err_free;
-
- *srq->db = 0;
-
- cmd.buf_addr = (uintptr_t) srq->buf.buf;
- cmd.db_addr = (uintptr_t) srq->db;
-
- ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr,
- xrc_domain->handle,
- xrc_cq->handle,
- &cmd.ibv_cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
- if (ret)
- goto err_db;
+ struct ibv_ah *ah;
+ struct ibv_exp_port_attr port_attr;
+ struct ibv_port_attr port_attr_legacy;
+ uint8_t link_layer;
- srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn;
+ port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1;
+ port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER;
- ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq);
- if (ret)
- goto err_destroy;
-
- return &srq->ibv_srq;
-
-err_destroy:
- ibv_cmd_destroy_srq(&srq->ibv_srq);
-
-err_db:
- mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
-
-err_free:
- free(srq->wrid);
- mlx4_free_buf(&srq->buf);
+ if (ibv_exp_query_port(pd->context, attr->port_num, &port_attr)) {
+ if (ibv_query_port(pd->context, attr->port_num, &port_attr_legacy))
+ return NULL;
-err:
- free(srq);
-
- return NULL;
-}
-
-struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
- int fd, int oflag)
-{
- int ret;
- struct mlx4_open_xrc_domain_resp resp;
- struct mlx4_xrc_domain *xrcd;
-
- xrcd = malloc(sizeof *xrcd);
- if (!xrcd)
- return NULL;
-
- ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd,
- &resp.ibv_resp, sizeof resp);
- if (ret) {
- free(xrcd);
- return NULL;
+ link_layer = port_attr_legacy.link_layer;
+ } else {
+ link_layer = port_attr.link_layer;
}
- xrcd->xrcdn = resp.xrcdn;
- return &xrcd->ibv_xrcd;
-}
-
-int mlx4_close_xrc_domain(struct ibv_xrc_domain *d)
-{
- int ret;
- ret = ibv_cmd_close_xrc_domain(d);
- if (!ret)
- free(d);
- return ret;
-}
-
-int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr,
- uint32_t *xrc_qp_num)
-{
-
- return ibv_cmd_create_xrc_rcv_qp(init_attr, xrc_qp_num);
-}
+ ah = mlx4_create_ah_common(pd, attr, link_layer);
-int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask)
-{
- return ibv_cmd_modify_xrc_rcv_qp(xrc_domain, xrc_qp_num,
- attr, attr_mask);
+ return ah;
}
-int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask,
- struct ibv_qp_init_attr *init_attr)
+int mlx4_destroy_ah(struct ibv_ah *ah)
{
- int ret;
-
- ret = ibv_cmd_query_xrc_rcv_qp(xrc_domain, xrc_qp_num,
- attr, attr_mask, init_attr);
- if (ret)
- return ret;
-
- init_attr->cap.max_send_wr = init_attr->cap.max_send_sge = 1;
- init_attr->cap.max_recv_sge = init_attr->cap.max_recv_wr = 0;
- init_attr->cap.max_inline_data = 0;
- init_attr->recv_cq = init_attr->send_cq = NULL;
- init_attr->srq = NULL;
- init_attr->xrc_domain = xrc_domain;
- init_attr->qp_type = IBV_QPT_XRC;
- init_attr->qp_context = NULL;
- attr->cap = init_attr->cap;
+ free(to_mah(ah));
return 0;
}
-
-int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num)
-{
- return ibv_cmd_reg_xrc_rcv_qp(xrc_domain, xrc_qp_num);
-}
-
-int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num)
-{
- return ibv_cmd_unreg_xrc_rcv_qp(xrc_domain, xrc_qp_num);
-}
-
-#endif
Index: contrib/ofed/libmlx4/src/verbs_exp.c
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/verbs_exp.c
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+/* Added for reg_mr mmap munmap system calls */
+#include <sys/mman.h>
+#include "mlx4.h"
+#include "mlx4-abi.h"
+#include "mlx4_exp.h"
+#include "wqe.h"
+
+static const char *qptype2key(enum ibv_qp_type type)
+{
+ switch (type) {
+ case IBV_QPT_RC: return "HUGE_RC";
+ case IBV_QPT_UC: return "HUGE_UC";
+ case IBV_QPT_UD: return "HUGE_UD";
+#ifdef _NOT_EXISTS_IN_OFED_2_0
+ case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH";
+#endif
+
+ default: return "HUGE_NA";
+ }
+}
+
+static void update_qp_cap_cache(struct ibv_qp *qp)
+{
+ struct mlx4_context *ctx = to_mctx(qp->context);
+ struct mlx4_qp *mqp = to_mqp(qp);
+
+ if (((qp->qp_type == IBV_QPT_RAW_ETH) && (mqp->link_layer == IBV_LINK_LAYER_ETHERNET)) &&
+ (ctx->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT))
+ mqp->qp_cap_cache |= MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP;
+}
+
+int update_port_data(struct ibv_qp *qp, uint8_t port_num)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ struct ibv_port_attr port_attr;
+ int err;
+
+ err = ibv_query_port(qp->context, port_num, &port_attr);
+ if (err)
+ return err;
+
+ mqp->link_layer = port_attr.link_layer;
+ update_qp_cap_cache(qp);
+
+ return 0;
+}
+
+int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr,
+ uint64_t attr_mask)
+{
+ struct ibv_exp_modify_qp cmd;
+ int ret;
+
+ memset(&cmd, 0, sizeof(cmd));
+ if (attr_mask & IBV_QP_PORT) {
+ ret = update_port_data(qp, attr->port_num);
+ if (ret)
+ return ret;
+ }
+
+ if (qp->state == IBV_QPS_RESET &&
+ (attr_mask & IBV_EXP_QP_STATE) &&
+ attr->qp_state == IBV_QPS_INIT) {
+ mlx4_qp_init_sq_ownership(to_mqp(qp));
+ }
+
+
+ ret = ibv_exp_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
+
+ if (!ret &&
+ (attr_mask & IBV_EXP_QP_STATE) &&
+ attr->qp_state == IBV_QPS_RESET) {
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
+
+ mlx4_init_qp_indices(to_mqp(qp));
+ if (to_mqp(qp)->rq.wqe_cnt)
+ *to_mqp(qp)->db = 0;
+ }
+
+ return ret;
+}
+
+static int verify_sizes(struct ibv_exp_qp_init_attr *attr, struct mlx4_context *context)
+{
+ int size;
+ int nsegs;
+
+ if (attr->cap.max_send_wr > context->max_qp_wr ||
+ attr->cap.max_recv_wr > context->max_qp_wr ||
+ attr->cap.max_send_sge > context->max_sge ||
+ attr->cap.max_recv_sge > context->max_sge)
+ return -1;
+
+ if (attr->cap.max_inline_data) {
+ nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type);
+ size = MLX4_MAX_WQE_SIZE - nsegs * sizeof(struct mlx4_wqe_inline_seg);
+ switch (attr->qp_type) {
+ case IBV_QPT_UD:
+ size -= (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_datagram_seg));
+ break;
+
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ size -= (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_raddr_seg));
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (attr->cap.max_inline_data > size)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int mlx4_exp_alloc_qp_buf(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr,
+ struct mlx4_qp *qp)
+{
+ int ret;
+ enum mlx4_alloc_type alloc_type;
+ enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+ const char *qp_huge_key;
+ int i, wqe_size;
+
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+ wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
+ if ((attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) && (attr->max_inl_recv)) {
+ qp->max_inlr_sg = qp->rq.max_gs;
+ wqe_size = max(wqe_size, attr->max_inl_recv);
+ }
+ for (qp->rq.wqe_shift = 4; 1 << qp->rq.wqe_shift < wqe_size; qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ if (qp->max_inlr_sg) {
+ attr->max_inl_recv = 1 << qp->rq.wqe_shift;
+ qp->max_inlr_sg = attr->max_inl_recv / sizeof(struct mlx4_wqe_data_seg);
+ }
+
+ if (qp->sq.wqe_cnt) {
+ qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
+ if (!qp->sq.wrid)
+ return -1;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t));
+ if (!qp->rq.wrid) {
+ free(qp->sq.wrid);
+ return -1;
+ }
+
+ if (qp->max_inlr_sg) {
+ qp->inlr_buff.buff = malloc(qp->rq.wqe_cnt * sizeof(*(qp->inlr_buff.buff)));
+ if (!qp->inlr_buff.buff) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ return -1;
+ }
+ qp->inlr_buff.len = qp->rq.wqe_cnt;
+ qp->inlr_buff.buff[0].sg_list = malloc(qp->rq.wqe_cnt *
+ sizeof(*(qp->inlr_buff.buff->sg_list)) *
+ qp->max_inlr_sg);
+ if (!qp->inlr_buff.buff->sg_list) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ free(qp->inlr_buff.buff);
+ return -1;
+ }
+ for (i = 1; i < qp->rq.wqe_cnt; i++)
+ qp->inlr_buff.buff[i].sg_list = &qp->inlr_buff.buff[0].sg_list[i * qp->max_inlr_sg];
+ }
+ }
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ if (qp->buf_size) {
+		/* compatibility support */
+ qp_huge_key = qptype2key(attr->qp_type);
+ if (mlx4_use_huge(context, qp_huge_key))
+ default_alloc_type = MLX4_ALLOC_TYPE_HUGE;
+
+
+ mlx4_get_alloc_type(context, MLX4_QP_PREFIX, &alloc_type,
+ default_alloc_type);
+
+ ret = mlx4_alloc_prefered_buf(to_mctx(context), &qp->buf,
+ align(qp->buf_size, to_mdev
+ (context->device)->page_size),
+ to_mdev(context->device)->page_size,
+ alloc_type,
+ MLX4_QP_PREFIX);
+
+ if (ret) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ if (qp->max_inlr_sg) {
+ free(qp->inlr_buff.buff[0].sg_list);
+ free(qp->inlr_buff.buff);
+ }
+ return -1;
+ }
+
+ memset(qp->buf.buf, 0, qp->buf_size);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.buf = qp->buf.buf;
+ qp->sq.buf = qp->buf.buf + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+ } else {
+ qp->rq.buf = qp->buf.buf + (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ qp->sq.buf = qp->buf.buf;
+ }
+
+ } else {
+ qp->buf.buf = NULL;
+ }
+
+ return 0;
+}
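
/*
 * Illustration only (editor's sketch, not part of the patch): the loop at
 * the top of mlx4_exp_alloc_qp_buf() picks the smallest power-of-two RQ WQE
 * stride that fits the computed wqe_size, with a 16-byte (shift 4) minimum.
 */
static int rq_wqe_shift_example(int wqe_size)
{
	int shift;

	for (shift = 4; 1 << shift < wqe_size; shift++)
		; /* nothing */

	/* e.g. 3 SGEs * 16-byte data segs = 48 bytes -> shift 6 (64-byte stride) */
	return shift;
}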
+
+static uint64_t send_db_to_uar(uintptr_t send_db)
+{
+ return (send_db - MLX4_SEND_DOORBELL);
+}
+
+static uint32_t *uar_to_send_db(uintptr_t uar)
+{
+ return (uint32_t *)(uar + MLX4_SEND_DOORBELL);
+}
+
+static void update_qp_bf_data(struct mlx4_res_domain *res_domain,
+ struct mlx4_qp *qp, struct ibv_context *context)
+{
+ switch (res_domain->type) {
+ case MLX4_RES_DOMAIN_BF_SAFE:
+ qp->db_method = MLX4_QP_DB_METHOD_BF;
+ break;
+ case MLX4_RES_DOMAIN_BF_UNSAFE:
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ break;
+ case MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT:
+ if (to_mctx(context)->prefer_bf)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ break;
+ default:
+ break;
+ }
+ qp->bf = &res_domain->send_db->bf;
+ qp->sdb = res_domain->send_db->db_addr;
+ qp->bf_buf_size = to_mctx(context)->bfs.buf_size;
+}
+
+struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr)
+{
+ struct mlx4_qp *qp;
+ int ret;
+ union {
+ struct mlx4_create_qp basic;
+ struct mlx4_exp_create_qp extended;
+ } cmd_obj;
+ union {
+ struct ibv_create_qp_resp basic;
+ struct ibv_exp_create_qp_resp extended;
+ } resp_obj;
+ struct mlx4_create_qp_base *cmd = NULL;
+ int ext_kernel_cmd = 0;
+ struct mlx4_bfs_data *bfs = &to_mctx(context)->bfs;
+ int i;
+ unsigned char cq_update;
+ int thread_safe = !mlx4_single_threaded;
+ int db_method_defined = 0;
+
+ memset(&resp_obj, 0, sizeof(resp_obj));
+ memset(&cmd_obj, 0, sizeof(cmd_obj));
+
+ if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_RESERVED1) {
+ errno = ENOSYS;
+ return NULL;
+ }
+
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) {
+ if (attr->srq)
+ attr->max_inl_recv = 0;
+ else
+ attr->max_inl_recv = min(attr->max_inl_recv,
+ (to_mctx(context)->max_sge *
+ sizeof(struct mlx4_wqe_data_seg)));
+ }
+
+ /* Sanity check QP size before proceeding */
+ if (verify_sizes(attr, to_mctx(context)))
+ return NULL;
+
+ if (attr->qp_type == IBV_QPT_XRC && attr->recv_cq &&
+ attr->cap.max_recv_wr > 0 && mlx4_trace)
+		fprintf(stderr, PFX "Warning: Legacy XRC sender should not use a receive cq\n");
+
+ qp = calloc(1, sizeof(*qp));
+ if (!qp)
+ return NULL;
+
+ qp->qp_cap_cache = 0;
+ if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS)
+ ext_kernel_cmd = 1;
+ if (attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
+ } else {
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG &&
+ attr->max_atomic_arg != 0) {
+ if (attr->max_atomic_arg == 8) {
+ qp->is_masked_atomic = 1;
+ } else {
+ fprintf(stderr, "%s: max_atomic_arg = %d is not valid for mlx4 (use 8 or 0)\n",
+ __FUNCTION__, attr->max_atomic_arg);
+ errno = EINVAL;
+ goto err;
+ }
+ }
+
+ mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+#ifdef MLX4_WQE_FORMAT
+ qp->sq_spare_wqes = 0;
+#else
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+#endif
+ qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+ }
+
+ if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
+ attr->qp_type == IBV_QPT_XRC_RECV ||
+ attr->qp_type == IBV_QPT_XRC) {
+ attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV)
+ attr->max_inl_recv = 0;
+ } else {
+ qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
+ if (attr->cap.max_recv_sge < 1)
+ attr->cap.max_recv_sge = 1;
+ if (attr->cap.max_recv_wr < 1)
+ attr->cap.max_recv_wr = 1;
+ }
+
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS)
+ qp->create_flags = attr->exp_create_flags & IBV_EXP_QP_CREATE_MASK;
+
+ if (mlx4_exp_alloc_qp_buf(context, attr, qp))
+ goto err;
+
+ mlx4_init_qp_indices(qp);
+
+ qp->sdb = (uint32_t *) (to_mctx(context)->uar + MLX4_SEND_DOORBELL);
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_RES_DOMAIN) {
+ struct mlx4_res_domain *rd;
+
+ if (!attr->res_domain) {
+ errno = EINVAL;
+ goto err_free;
+ }
+ rd = to_mres_domain(attr->res_domain);
+ if (rd->attr.thread_model == IBV_EXP_THREAD_UNSAFE ||
+ rd->attr.thread_model == IBV_EXP_THREAD_SINGLE)
+ thread_safe = 0;
+
+ if (rd->send_db) {
+ cmd_obj.extended.exp_cmd.uar_virt_add = send_db_to_uar((uintptr_t)rd->send_db->db_addr);
+ update_qp_bf_data(rd, qp, context);
+ db_method_defined = 1;
+ }
+ }
+
+ if (mlx4_lock_init(&qp->sq.lock, thread_safe, mlx4_get_locktype()))
+ goto err_free;
+ if (mlx4_lock_init(&qp->rq.lock, thread_safe, mlx4_get_locktype()))
+ goto sq_lock_destroy;
+
+ cmd = (ext_kernel_cmd ?
+ &cmd_obj.extended.exp_cmd.base : &cmd_obj.basic.base);
+
+ if (attr->cap.max_recv_sge) {
+ qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!qp->db)
+ goto rq_lock_destroy;
+
+ *qp->db = 0;
+ cmd->db_addr = (uintptr_t) qp->db;
+ } else {
+ cmd->db_addr = 0;
+ }
+
+ cmd->buf_addr = (uintptr_t) qp->buf.buf;
+ cmd->log_sq_stride = qp->sq.wqe_shift;
+ for (cmd->log_sq_bb_count = 0;
+ qp->sq.wqe_cnt > 1 << cmd->log_sq_bb_count;
+ ++cmd->log_sq_bb_count)
+ ; /* nothing */
+ cmd->sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
+ memset(cmd->reserved, 0, sizeof(cmd->reserved));
+
+ pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);
+ ret = ibv_exp_cmd_create_qp(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ ext_kernel_cmd ?
+ (void *)&cmd_obj.extended.ibv_cmd :
+ (void *)&cmd_obj.basic.ibv_cmd,
+ ext_kernel_cmd ?
+ sizeof(cmd_obj.extended.ibv_cmd) :
+ sizeof(cmd_obj.basic.ibv_cmd),
+ ext_kernel_cmd ?
+ sizeof(cmd_obj.extended.exp_cmd) :
+ sizeof(cmd_obj.basic.base),
+ ext_kernel_cmd ?
+ (void *)&resp_obj.extended : (void *)&resp_obj.basic,
+ ext_kernel_cmd ?
+ sizeof(resp_obj.extended) :
+ sizeof(resp_obj.basic),
+ 0, 0);
+ if (ret) {
+ errno = ret;
+ goto err_rq_db;
+ }
+
+ if (qp->max_inlr_sg && (attr->max_inl_recv != (1 << qp->rq.wqe_shift)))
+ goto err_destroy;
+
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
+ ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
+ if (ret)
+ goto err_destroy;
+ }
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+
+ qp->rq.wqe_cnt = attr->cap.max_recv_wr;
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+
+ /* adjust rq maxima to not exceed reported device maxima */
+ attr->cap.max_recv_wr = min(to_mctx(context)->max_qp_wr,
+ attr->cap.max_recv_wr);
+ attr->cap.max_recv_sge = min(to_mctx(context)->max_sge,
+ attr->cap.max_recv_sge);
+
+ qp->rq.max_post = attr->cap.max_recv_wr;
+ if (attr->qp_type != IBV_QPT_XRC_RECV)
+ mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
+
+ qp->doorbell_qpn = htonl(qp->verbs_qp.qp.qp_num << 8);
+ if (attr->sq_sig_all)
+ cq_update = MLX4_WQE_CTRL_CQ_UPDATE;
+ else
+ cq_update = 0;
+
+ /*
+	 * The srcrb_flags_tbl is a table used to get the right value for the first
+	 * byte of the srcrb_flags field in the WQE ctrl segment.
+	 * The value is derived from the QP sq_sig_all flag and the 4 WR flags
+	 * IBV_EXP_SEND_SIGNALED, IBV_EXP_SEND_SOLICITED, IBV_EXP_SEND_IP_CSUM
+	 * and IBV_EXP_SEND_TUNNEL.
+	 * These flags are used as an index to get the required value from the table.
+	 * The IBV_EXP_SEND_SIGNALED flag defines the first bit of the index,
+	 * IBV_EXP_SEND_SOLICITED defines the second bit, IBV_EXP_SEND_IP_CSUM
+	 * defines the third bit and IBV_EXP_SEND_TUNNEL the fourth one.
+	 * Therefore to calculate the index we can use:
+	 * idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED |
+	 *       (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) |
+	 *       (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2) |
+	 *       (exp_send_flags & IBV_EXP_SEND_TUNNEL)/(IBV_EXP_SEND_TUNNEL >> 3);
+ */
+ qp->srcrb_flags_tbl[0] = cq_update;
+ qp->srcrb_flags_tbl[1] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[2] = MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[3] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[4] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | cq_update;
+ qp->srcrb_flags_tbl[5] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[6] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[7] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[8] = cq_update;
+ qp->srcrb_flags_tbl[9] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[10] = MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[11] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[12] = MLX4_WQE_CTRL_IP_CSUM | cq_update;
+ qp->srcrb_flags_tbl[13] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[14] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[15] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+
+ qp->qp_type = attr->qp_type;
+
+ /* Set default value of cached RX csum flags to 0 */
+ qp->cached_rx_csum_flags = 0;
+ /* Set transposed_rx_csum_flags to match the cached_rx_csum_flags = 0 */
+ qp->transposed_rx_csum_flags = IBV_EXP_CQ_RX_OUTER_IPV6_PACKET;
+
+ if (!db_method_defined && bfs->buf_size == 0) {
+ /* not using BF */
+ qp->db_method = MLX4_QP_DB_METHOD_DB;
+ } else if (!db_method_defined) {
+ /*
+ * To gain performance, dedic_bf_free is first tested without taking
+ * the dedic_bf_lock.
+ */
+ if (bfs->dedic_bf_free) {
+ mlx4_spin_lock(&bfs->dedic_bf_lock);
+ for (i = 0 ; i < bfs->num_dedic_bfs; i++) {
+ if (!bfs->dedic_bf_used[i]) {
+ /* using dedicated BF */
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ qp->bf = (union mlx4_bf *)(&bfs->dedic_bf[i]);
+ bfs->dedic_bf_used[i] = 1;
+ bfs->dedic_bf_free--;
+ break;
+ }
+ }
+ mlx4_spin_unlock(&bfs->dedic_bf_lock);
+ }
+ if (!qp->bf) {
+ /* using common BF */
+ if (mlx4_single_threaded)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_BF;
+ qp->bf = (union mlx4_bf *)(&bfs->cmn_bf);
+ }
+ if (qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF &&
+ mlx4_single_threaded && (wc_auto_evict_size() == 64)) {
+ if (to_mctx(context)->prefer_bf)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ }
+ qp->bf_buf_size = bfs->buf_size;
+ }
+
+ qp->model_flags = thread_safe ? MLX4_QP_MODEL_FLAG_THREAD_SAFE : 0;
+ mlx4_update_post_send_one(qp);
+ qp->pattern = MLX4_QP_PATTERN;
+
+ return &qp->verbs_qp.qp;
+
+err_destroy:
+ ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
+
+err_rq_db:
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+ if (attr->cap.max_recv_sge)
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);
+
+rq_lock_destroy:
+ mlx4_lock_destroy(&qp->rq.lock);
+
+sq_lock_destroy:
+ mlx4_lock_destroy(&qp->sq.lock);
+
+err_free:
+ mlx4_dealloc_qp_buf(context, qp);
+
+err:
+ free(qp);
+
+ return NULL;
+}
+
+int mlx4_exp_query_device(struct ibv_context *context,
+ struct ibv_exp_device_attr *device_attr)
+{
+ struct ibv_exp_query_device cmd;
+ struct ibv_port_attr port_attr;
+ uint64_t raw_fw_ver;
+ int ret;
+ int i;
+
+ ret = ibv_exp_cmd_query_device(context, device_attr, &raw_fw_ver,
+ &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (device_attr->exp_device_cap_flags & IBV_EXP_DEVICE_CROSS_CHANNEL) {
+ device_attr->comp_mask |= IBV_EXP_DEVICE_ATTR_CALC_CAP;
+ device_attr->calc_cap.data_types = (1ULL << IBV_EXP_CALC_DATA_TYPE_INT) |
+ (1ULL << IBV_EXP_CALC_DATA_TYPE_UINT) |
+ (1ULL << IBV_EXP_CALC_DATA_TYPE_FLOAT);
+ device_attr->calc_cap.data_sizes = (1ULL << IBV_EXP_CALC_DATA_SIZE_64_BIT);
+ device_attr->calc_cap.int_ops = (1ULL << IBV_EXP_CALC_OP_ADD) |
+ (1ULL << IBV_EXP_CALC_OP_BAND) |
+ (1ULL << IBV_EXP_CALC_OP_BXOR) |
+ (1ULL << IBV_EXP_CALC_OP_BOR);
+ device_attr->calc_cap.uint_ops = device_attr->calc_cap.int_ops;
+ device_attr->calc_cap.fp_ops = device_attr->calc_cap.int_ops;
+ }
+ device_attr->exp_device_cap_flags |= IBV_EXP_DEVICE_MR_ALLOCATE;
+
+ if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS) &&
+ (device_attr->exp_device_cap_flags & (IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT |
+ IBV_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IBV_EXP_DEVICE_VXLAN_SUPPORT))) {
+ for (i = 0; i < device_attr->phys_port_cnt; i++) {
+ ret = mlx4_query_port(context, i + 1, &port_attr);
+ if (ret)
+ return ret;
+
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ device_attr->exp_device_cap_flags &= ~(IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT |
+ IBV_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IBV_EXP_DEVICE_VXLAN_SUPPORT);
+ break;
+ }
+ }
+ }
+
+ return __mlx4_query_device(
+ raw_fw_ver,
+ (struct ibv_device_attr *)device_attr);
+}
+
+int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num,
+ struct ibv_exp_port_attr *port_attr)
+{
+ /* Check that only valid flags were given */
+ if (!(port_attr->comp_mask & IBV_EXP_QUERY_PORT_ATTR_MASK1) ||
+ (port_attr->comp_mask & ~IBV_EXP_QUERY_PORT_ATTR_MASKS) ||
+ (port_attr->mask1 & ~IBV_EXP_QUERY_PORT_MASK)) {
+ return EINVAL;
+ }
+
+ /* Optimize the link type query */
+ if (port_attr->comp_mask == IBV_EXP_QUERY_PORT_ATTR_MASK1) {
+ if (!(port_attr->mask1 & ~(IBV_EXP_QUERY_PORT_LINK_LAYER |
+ IBV_EXP_QUERY_PORT_CAP_FLAGS))) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
+ return EINVAL;
+ if (mctx->port_query_cache[port_num - 1].valid) {
+ if (port_attr->mask1 &
+ IBV_EXP_QUERY_PORT_LINK_LAYER)
+ port_attr->link_layer =
+ mctx->
+ port_query_cache[port_num - 1].
+ link_layer;
+ if (port_attr->mask1 &
+ IBV_EXP_QUERY_PORT_CAP_FLAGS)
+ port_attr->port_cap_flags =
+ mctx->
+ port_query_cache[port_num - 1].
+ caps;
+ return 0;
+ }
+ }
+ if (port_attr->mask1 & IBV_EXP_QUERY_PORT_STD_MASK) {
+ return mlx4_query_port(context, port_num,
+ &port_attr->port_attr);
+ }
+ }
+
+ return EOPNOTSUPP;
+}
+
+struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd,
+ struct ibv_exp_ah_attr *attr_ex)
+{
+ struct ibv_exp_port_attr port_attr;
+ struct ibv_ah *ah;
+ struct mlx4_ah *mah;
+
+ port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1;
+ port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER;
+
+ if (ibv_exp_query_port(pd->context, attr_ex->port_num, &port_attr))
+ return NULL;
+
+ ah = mlx4_create_ah_common(pd, (struct ibv_ah_attr *)attr_ex,
+ port_attr.link_layer);
+
+ if (NULL == ah)
+ return NULL;
+
+ mah = to_mah(ah);
+
+ /* If a VLAN was given, check that we can use it */
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID &&
+ attr_ex->vid <= 0xfff &&
+ (0 == attr_ex->ll_address.len ||
+ !(attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL)))
+ goto err;
+
+ /* ll_address.len == 0 means no ll address given */
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL &&
+ 0 != attr_ex->ll_address.len) {
+ if (LL_ADDRESS_ETH != attr_ex->ll_address.type ||
+ port_attr.link_layer != IBV_LINK_LAYER_ETHERNET)
+ /* mlx4 provider currently supports only ethernet
+ * extensions */
+ goto err;
+
+ /* link layer is ethernet */
+ if (6 != attr_ex->ll_address.len ||
+ NULL == attr_ex->ll_address.address)
+ goto err;
+
+ memcpy(mah->mac, attr_ex->ll_address.address,
+ attr_ex->ll_address.len);
+
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID &&
+ attr_ex->vid <= 0xfff) {
+ mah->av.port_pd |= htonl(1 << 29);
+ mah->vlan = attr_ex->vid |
+ ((attr_ex->sl & 7) << 13);
+ }
+ }
+
+ return ah;
+
+err:
+ free(ah);
+ return NULL;
+}
+
+static struct mlx4_send_db_data *allocate_send_db(struct mlx4_context *ctx)
+{
+ struct mlx4_device *dev = to_mdev(ctx->ibv_ctx.device);
+ struct mlx4_send_db_data *send_db = NULL;
+ unsigned int uar_idx;
+ void *uar;
+ void *bfs;
+ int i;
+
+ if (!ctx->max_ctx_res_domain || !ctx->bfs.buf_size) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ mlx4_spin_lock(&ctx->send_db_lock);
+ if (!list_empty(&ctx->send_db_list)) {
+ send_db = list_entry(ctx->send_db_list.next, struct mlx4_send_db_data, list);
+ list_del(&send_db->list);
+ }
+ mlx4_spin_unlock(&ctx->send_db_lock);
+
+ if (!send_db) {
+ /* Fill up more send_db objects */
+ mlx4_spin_lock(&ctx->send_db_lock);
+ if ((ctx->send_db_num_uars + 1) * ctx->bf_regs_per_page >= ctx->max_ctx_res_domain) {
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ errno = ENOMEM;
+ return NULL;
+ }
+ uar_idx = ctx->send_db_num_uars;
+ ctx->send_db_num_uars++;
+ mlx4_spin_unlock(&ctx->send_db_lock);
+
+ uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+ ctx->ibv_ctx.cmd_fd,
+ dev->page_size * (MLX4_IB_EXP_MMAP_EXT_UAR_PAGE |
+ (uar_idx << MLX4_MMAP_CMD_BITS)));
+ if (uar == MAP_FAILED)
+ return NULL;
+ bfs = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+ ctx->ibv_ctx.cmd_fd,
+ dev->page_size * (MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE |
+ (uar_idx << MLX4_MMAP_CMD_BITS)));
+ if (bfs == MAP_FAILED) {
+ munmap(uar, dev->page_size);
+ return NULL;
+ }
+ mlx4_spin_lock(&ctx->send_db_lock);
+ for (i = 0; i < ctx->bf_regs_per_page; i++) {
+ send_db = calloc(1, sizeof(*send_db));
+ if (!send_db) {
+ if (i)
+ break;
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ mlx4_lock_init(&send_db->bf.cmn.lock,
+ !mlx4_single_threaded,
+ mlx4_get_locktype());
+
+ send_db->db_addr = uar_to_send_db((uintptr_t)uar);
+
+ /* Allocate a pair of blue-flames to toggle sends between them */
+ send_db->bf.cmn.address = bfs + (i * ctx->bfs.buf_size * 2);
+ list_add(&send_db->list, &ctx->send_db_list);
+ }
+
+ /* Return the last send_db object to the caller */
+ list_del(&send_db->list);
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ }
+
+ return send_db;
+}
+
+static void free_send_db(struct mlx4_context *ctx,
+ struct mlx4_send_db_data *send_db)
+{
+ mlx4_spin_lock(&ctx->send_db_lock);
+ list_add(&send_db->list, &ctx->send_db_list);
+ mlx4_spin_unlock(&ctx->send_db_lock);
+}
+
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain_init_attr *attr)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ struct mlx4_res_domain *res_domain;
+
+ if (attr->comp_mask >= IBV_EXP_RES_DOMAIN_RESERVED) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ res_domain = calloc(1, sizeof(*res_domain));
+ if (!res_domain) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ res_domain->ibv_res_domain.context = context;
+
+ /* set default values */
+ res_domain->attr.thread_model = IBV_EXP_THREAD_SAFE;
+ res_domain->attr.msg_model = IBV_EXP_MSG_DEFAULT;
+ /* get requested valid values */
+ if (attr->comp_mask & IBV_EXP_RES_DOMAIN_THREAD_MODEL)
+ res_domain->attr.thread_model = attr->thread_model;
+ if (attr->comp_mask & IBV_EXP_RES_DOMAIN_MSG_MODEL)
+ res_domain->attr.msg_model = attr->msg_model;
+ res_domain->attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+ IBV_EXP_RES_DOMAIN_MSG_MODEL;
+ /*
+ * Allocate a BF for every resource domain, since BF improves
+ * both the BW and the latency of a single message.
+ */
+ res_domain->send_db = allocate_send_db(ctx);
+
+ /* define resource domain type */
+ if (!res_domain->send_db) {
+ if (res_domain->attr.msg_model == IBV_EXP_MSG_FORCE_LOW_LATENCY)
+ /*
+ * Fail in case the user asked for a forced low-latency
+ * resource-domain but we can't allocate a
+ * dedicated BF.
+ */
+ goto err;
+ else
+ /*
+ * Dedicated BF is not allocated for the
+ * resource-domain.
+ */
+ res_domain->type = MLX4_RES_DOMAIN_BF_NONE;
+ } else {
+ /*
+ * In case a dedicated BF was allocated, set the
+ * resource-domain type according to the
+ * thread-model.
+ */
+ switch (res_domain->attr.thread_model) {
+ case IBV_EXP_THREAD_SAFE:
+ res_domain->type = MLX4_RES_DOMAIN_BF_SAFE;
+ break;
+ case IBV_EXP_THREAD_UNSAFE:
+ res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+ break;
+ case IBV_EXP_THREAD_SINGLE:
+ if (wc_auto_evict_size() == 64)
+ res_domain->type = MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT;
+ else
+ res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+ break;
+ }
+ }
+
+ return &res_domain->ibv_res_domain;
+
+err:
+ free(res_domain);
+
+ return NULL;
+}
+
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain *res_dom,
+ struct ibv_exp_destroy_res_domain_attr *attr)
+{
+ struct mlx4_res_domain *res_domain = to_mres_domain(res_dom);
+
+ if (res_domain->send_db)
+ free_send_db(to_mctx(context), res_domain->send_db);
+
+ free(res_domain);
+
+ return 0;
+}
+
+void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ void *family = NULL;
+ struct mlx4_qp *qp;
+ struct mlx4_cq *cq;
+
+ *status = IBV_EXP_INTF_STAT_OK;
+
+ if (!params->obj) {
+ errno = EINVAL;
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+
+ return NULL;
+ }
+
+ if (params->intf_version > MLX4_MAX_FAMILY_VER) {
+ *status = IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ switch (params->intf) {
+ case IBV_EXP_INTF_QP_BURST:
+ qp = to_mqp(params->obj);
+ if (qp->pattern == MLX4_QP_PATTERN) {
+ family = mlx4_get_qp_burst_family(qp, params, status);
+ if (*status != IBV_EXP_INTF_STAT_OK) {
+ fprintf(stderr, PFX "Failed to get QP burst family\n");
+ errno = EINVAL;
+ }
+ } else {
+ fprintf(stderr, PFX "Warning: non-valid QP passed to query interface\n");
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+ errno = EINVAL;
+ }
+ break;
+
+ case IBV_EXP_INTF_CQ:
+ cq = to_mcq(params->obj);
+ if (cq->pattern == MLX4_CQ_PATTERN) {
+ family = (void *)mlx4_get_poll_cq_family(cq, params, status);
+ } else {
+ fprintf(stderr, PFX "Warning: non-valid CQ passed to query interface\n");
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+ errno = EINVAL;
+ }
+ break;
+
+ default:
+ *status = IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED;
+ errno = EINVAL;
+ }
+
+ return family;
+}
+
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+ struct ibv_exp_release_intf_params *params)
+{
+ return 0;
+}
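
As a companion to the srcrb_flags_tbl comment in mlx4_exp_create_qp() above, the sketch below shows how a post-send path could derive the 4-bit table index from the experimental send flags. It is an illustration only, not part of the patch: the helper name is hypothetical, and it assumes the IBV_EXP_SEND_* flags from <infiniband/verbs_exp.h> are the single bits the comment describes.

#include <stdint.h>
#include <infiniband/verbs_exp.h>	/* IBV_EXP_SEND_* flags (assumed) */

/*
 * Map the four relevant send flags onto bits 0..3 of the index.
 * Each division shifts one flag down to its index bit, exactly as the
 * formula in the comment above describes.
 */
static inline unsigned int srcrb_flags_index(uint64_t exp_send_flags)
{
	return (exp_send_flags & IBV_EXP_SEND_SIGNALED) / IBV_EXP_SEND_SIGNALED |
	       (exp_send_flags & IBV_EXP_SEND_SOLICITED) / (IBV_EXP_SEND_SOLICITED >> 1) |
	       (exp_send_flags & IBV_EXP_SEND_IP_CSUM) / (IBV_EXP_SEND_IP_CSUM >> 2) |
	       (exp_send_flags & IBV_EXP_SEND_TUNNEL) / (IBV_EXP_SEND_TUNNEL >> 3);
}

/* A sender would then pick qp->srcrb_flags_tbl[srcrb_flags_index(flags)]
 * as the low byte of the WQE ctrl segment's srcrb_flags. */
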
Index: contrib/ofed/libmlx4/src/wqe.h
===================================================================
--- contrib/ofed/libmlx4/src/wqe.h
+++ contrib/ofed/libmlx4/src/wqe.h
@@ -38,9 +38,19 @@
};
enum {
- MLX4_WQE_CTRL_FENCE = 1 << 6,
- MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
- MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_FENCE = 1 << 6,
+ MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
+ MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7,
+ MLX4_WQE_CTRL_IIP = 1 << 28,
+ MLX4_WQE_CTRL_IL4 = 1 << 27,
+ MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+ MLX4_WQE_CTRL_IP_CSUM = 1 << 4,
+};
+
+enum {
+ MLX4_WQE_BIND_TYPE_2 = (1<<31),
+ MLX4_WQE_BIND_ZERO_BASED = (1<<30),
};
enum {
@@ -54,8 +64,7 @@
struct mlx4_wqe_ctrl_seg {
uint32_t owner_opcode;
- uint16_t vlan_tag;
- uint8_t ins_vlan;
+ uint8_t reserved[3];
uint8_t fence_size;
/*
* High 24 bits are SRC remote buffer; low 8 bits are flags:
@@ -66,7 +75,10 @@
* [1] SE (solicited event)
* [0] FL (force loopback)
*/
- uint32_t xrcrb_flags;
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ };
/*
* imm is immediate data for send/RDMA write w/ immediate;
* also invalidation key for send with invalidate; input
@@ -99,6 +111,19 @@
uint32_t reserved2[3];
};
+struct mlx4_wqe_local_inval_seg {
+ uint64_t reserved1;
+ uint32_t mem_key;
+ uint32_t reserved2;
+ uint64_t reserved3[2];
+};
+
+enum {
+ MLX4_WQE_MW_REMOTE_READ = 1 << 29,
+ MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+ MLX4_WQE_MW_ATOMIC = 1 << 31
+};
+
struct mlx4_wqe_raddr_seg {
uint64_t raddr;
uint32_t rkey;
@@ -110,6 +135,13 @@
uint64_t compare;
};
+struct mlx4_wqe_masked_atomic_seg {
+ uint64_t swap_data;
+ uint64_t cmp_data;
+ uint64_t swap_mask;
+ uint64_t cmp_mask;
+};
+
struct mlx4_wqe_bind_seg {
uint32_t flags1;
uint32_t flags2;
@@ -119,4 +151,11 @@
uint64_t length;
};
+struct mlx4_wqe_wait_en_seg {
+ uint32_t valid;
+ uint32_t resv;
+ uint32_t pi;
+ uint32_t obj_num;
+};
+
#endif /* WQE_H */
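
The reworked mlx4_wqe_ctrl_seg above drops the explicit vlan fields and exposes the flags word both as srcrb_flags and, via the union, as two 16-bit halves (srcrb_flags16) for callers that only need to touch part of it. Below is a minimal sketch of how the new checksum bits could be combined when filling the control segment; it is illustrative only and not taken from the patch, and it assumes the surrounding post-send logic and byte-order handling follow the driver's existing conventions.

#include <stdint.h>
#include <arpa/inet.h>	/* htonl */
#include "wqe.h"	/* the header patched above */

/* Request a CQE plus IP and TCP/UDP checksum offload for one WQE. */
static void ctrl_set_csum_flags(struct mlx4_wqe_ctrl_seg *ctrl)
{
	ctrl->srcrb_flags = htonl(MLX4_WQE_CTRL_CQ_UPDATE |
				  MLX4_WQE_CTRL_IP_CSUM |
				  MLX4_WQE_CTRL_TCP_UDP_CSUM);
}
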
Index: contrib/ofed/usr.lib/libmlx4/Makefile
===================================================================
--- contrib/ofed/usr.lib/libmlx4/Makefile
+++ contrib/ofed/usr.lib/libmlx4/Makefile
@@ -14,7 +14,7 @@
SHLIB_MAJOR= 1
MK_PROFILE= no
-SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c
+SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c verbs_exp.c
LIBADD= ibverbs pthread
CFLAGS+= -DHAVE_CONFIG_H
Index: contrib/ofed/usr.lib/libmlx4/config.h
===================================================================
--- contrib/ofed/usr.lib/libmlx4/config.h
+++ contrib/ofed/usr.lib/libmlx4/config.h
@@ -1,4 +1,3 @@
-#define HAVE_IBV_DONTFORK_RANGE
-#define HAVE_IBV_DOFORK_RANGE
-#define HAVE_IBV_REGISTER_DRIVER
-#define HAVE_IBV_READ_SYSFS_FILE
+#define HAVE_IBV_DOFORK_RANGE 1
+#define HAVE_IBV_DONTFORK_RANGE 1
+#define HAVE_IBV_REGISTER_DRIVER 1
