D5793.diff

Index: contrib/ofed/libmlx4/Makefile.am
===================================================================
--- contrib/ofed/libmlx4/Makefile.am
+++ contrib/ofed/libmlx4/Makefile.am
@@ -1,12 +1,19 @@
-AM_CFLAGS = -g -Wall -D_GNU_SOURCE
+AM_CFLAGS = -g -Wall -Werror -D_GNU_SOURCE
mlx4_version_script = @MLX4_VERSION_SCRIPT@
MLX4_SOURCES = src/buf.c src/cq.c src/dbrec.c src/mlx4.c src/qp.c \
- src/srq.c src/verbs.c
+ src/srq.c src/verbs.c src/verbs_exp.c
+noinst_HEADERS = src/bitmap.h src/doorbell.h src/list.h src/mlx4-abi.h src/mlx4_exp.h src/mlx4.h src/wqe.h
if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
- lib_LTLIBRARIES = src/libmlx4.la
+ lib_LTLIBRARIES =
+else
+ mlx4lib_LTLIBRARIES =
+endif
+
+if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
+ lib_LTLIBRARIES += src/libmlx4.la
src_libmlx4_la_SOURCES = $(MLX4_SOURCES)
src_libmlx4_la_LDFLAGS = -avoid-version -release @IBV_DEVICE_LIBRARY_EXTENSION@ \
$(mlx4_version_script)
@@ -14,13 +21,14 @@
mlx4conf_DATA = mlx4.driver
else
mlx4libdir = $(libdir)/infiniband
- mlx4lib_LTLIBRARIES = src/mlx4.la
+ mlx4lib_LTLIBRARIES += src/mlx4.la
src_mlx4_la_SOURCES = $(MLX4_SOURCES)
src_mlx4_la_LDFLAGS = -avoid-version -module $(mlx4_version_script)
endif
-EXTRA_DIST = src/doorbell.h src/mlx4.h src/mlx4-abi.h src/wqe.h \
- src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST = src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST += debian
+EXTRA_DIST += autogen.sh
dist-hook: libmlx4.spec
cp libmlx4.spec $(distdir)
Index: contrib/ofed/libmlx4/autogen.sh
===================================================================
--- contrib/ofed/libmlx4/autogen.sh
+++ contrib/ofed/libmlx4/autogen.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#! /bin/sh -eE
set -x
aclocal -I config
Index: contrib/ofed/libmlx4/configure.ac
===================================================================
--- contrib/ofed/libmlx4/configure.ac
+++ contrib/ofed/libmlx4/configure.ac
@@ -1,12 +1,15 @@
dnl Process this file with autoconf to produce a configure script.
AC_PREREQ(2.57)
-AC_INIT(libmlx4, 1.0, general@lists.openfabrics.org)
+AC_INIT(libmlx4, 1.0.6mlnx1, linux-rdma@vger.kernel.org)
AC_CONFIG_SRCDIR([src/mlx4.h])
AC_CONFIG_AUX_DIR(config)
-AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(libmlx4, 1.0)
-AM_PROG_LIBTOOL
+AC_CONFIG_HEADER(config.h)
+AM_INIT_AUTOMAKE([1.10 foreign tar-ustar silent-rules subdir-objects])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_PROG_LIBTOOL
+LT_INIT
AC_ARG_WITH([valgrind],
AC_HELP_STRING([--with-valgrind],
@@ -21,6 +24,13 @@
fi
fi
+#--with-wqe-format
+AC_ARG_WITH([wqe-format],
+ AC_HELP_STRING([--with-wqe-format],
+ [Enable wqe-format annotations (default NO)]),
+ AC_DEFINE([MLX4_WQE_FORMAT], 1, [Define to 1 to enable wqe-format annotations.]),
+)
+
dnl Checks for programs
AC_PROG_CC
@@ -32,22 +42,19 @@
AC_CHECK_HEADER(infiniband/driver.h, [],
AC_MSG_ERROR([<infiniband/driver.h> not found. libmlx4 requires libibverbs.]))
AC_HEADER_STDC
-AC_CHECK_HEADER(valgrind/memcheck.h,
- [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
- [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
- [if test $want_valgrind = yes; then
- AC_MSG_ERROR([Valgrind memcheck support requested, but <valgrind/memcheck.h> not found.])
- fi])
+
+if test x$want_valgrind = xyes; then
+ AC_CHECK_HEADER(valgrind/memcheck.h,
+ [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
+ [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
+ [if test $want_valgrind = yes; then
+ AC_MSG_ERROR([Valgrind memcheck support requested, but <valgrind/memcheck.h> not found.])
+ fi])
+fi
dnl Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_CHECK_SIZEOF(long)
-AC_CHECK_MEMBER(struct ibv_context.more_ops,
- [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],,
- [#include <infiniband/verbs.h>])
-AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq,
- [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],,
- [#include <infiniband/verbs.h>])
dnl Checks for library functions
AC_CHECK_FUNC(ibv_read_sysfs_file, [],
Index: contrib/ofed/libmlx4/debian/changelog
===================================================================
--- contrib/ofed/libmlx4/debian/changelog
+++ contrib/ofed/libmlx4/debian/changelog
@@ -1,8 +1,201 @@
-libmlx4 (1.0-2) unstable; urgency=low
+libmlx4 (1.0.6mlnx1-1) unstable; urgency=low
- * Add debian/watch file
+ * libmlx4: Fix MR address change in rereg_mr
+ * libmlx4: revert the endianness fix for immediate data
+ * libmlx4: split post_send_one to qp types
+ * libmlx4: Add post_send_one to qp struct
+ * libmlx4: remove inl from basic set_data_seg functions
+ * libmlx4: Set data segment in one function
+ * libmlx4: set ctrl segment in one function
+ * libmlx4: use htonl when copy immediate data to WQE
+ * libmlx4: fix bug in bf_buf_size update
+ * libmlx4: Define set_data_seg as inline function
+ * libmlx4: reduce cache used by datapath
+ * libmlx4: optimize wq_overflow
+ * libmlx4: Add another DB ringing method
+ * libmlx4: Use x86_64 SSE2 instructions to improve bf_copy
+ * libmlx4: Add new DB ringing mode
+ * libmlx4: use all 8 BFs
+ * libmlx4: split ring_db function
+ * libmlx4: add door-bell ring function
+ * Modify call from ibv_exp_getenv to ibv_exp_cmd_getenv
+ * libmlx4: fix contiguous page registration
+ * Modify to use verbs specific getenv
+ * libmlx4: avoid creating AH with DLID 0
+ * libmlx4: fixed resize cq overrun bug
+ * libmlx4.spec.in: Changed valgrind libs DESTDIR
+ * Added valgrind support
+ * fixed and added valgrind Macros
+ * Adding experimental dereg_mr support
+ * shared_mr: handle duplication from glob/procfs
+ * shared_mr: fine-tuned counter mode name
+ * fix 32 bit compile warning
+ * shared mr with counter name support
+ * libmlx4: allow user to specify the addr of contig pages.
+ * libmlx4: avoid using gettimeofday in mlx4_reg_shared_mr.
+ * libmlx4: init exp_mw_bind.
+ * libmlx4: added -Werror to Makefile
+ * libmlx4: Use masked atomics only if max_atomic_arg defined
+ * wc_flags should be set even when using experimental verbs
+ * libmlx4: return errno on ibv_post_srq_recv
+ * libmlx4: Retry open shared mr file
+ * libmlx4: Add completion opcodes for masked atomic operations
+ * Verify hop_limit > 1 in create_ah
+ * libmlx4.spec.in: Support configure_options flag.
+ * configure: Update AM_INIT_AUTOMAKE to support new auto tools.
+ * Add MR re-registration
+ * mlx4: Add support for timestamping when initiating context.
+ * libmlx4: Do not publish support for IBV_CALC_OP_MAXLOC
+ * Fix comp_mask handling in ibv_exp_query_values
+ * libmlx4: Simplify extended atomics API
+ * libmlx4: Fix wrong wqe pointer advance
+ * libmlx4: Add support for masked atomics
+ * Revert "libmlx4: Fix log function to avoid overflow"
+ * libmlx4: add ibv_exp_modify_qp to mlx4
+ * libmlx4: Fix overflow on flag mask
+ * libmlx4: Fix log function to avoid overflow
+ * libmlx4: improve experimental interface
+ * A correct AH was free'd by mistake
+ * Align create_ah_ex and query_port_ex to upstream
+ * Change imm_data to ex.imm_data or ex.invalidate_rkey
+ * libmlx4: change wc_size from int to uint32_t.
+ * libmlx4: Print prefer_bf message only in trace mode.
+ * libmlx4: separate mlx4_post_send to EXP & NON EXP
- -- Roland Dreier <rolandd@cisco.com> Wed, 12 Mar 2008 10:40:19 -0700
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Wed, 10 Dec 2014 10:53:10 +0200
+
+libmlx4 (1.0.5mlnx1-1) unstable; urgency=low
+
+ * resize_cq: fix possible endless loop scanning CQ
+ * User QP/SRQ in work completion
+ * libmlx4: Align verbs interface with upstream
+ * libmlx4: add ibv_exp_reg_mr experimental verb
+ * libmlx4: Change legacy extended verbs to experimental verbs
+ * libmlx4: Change legacy extended uverbs to experimental uverbs
+ * unmap hca_clock_page in mlx4_uninit_context
+ * Enable contiguous pages for Control resources by default
+ * New experimental verbs for query_port
+ * Added htobe64 definition which is missing on SLES10
+ * Fix QoS issues for UD QPs
+ * Allocate zeroized memory for CQ
+ * libmlx4: Change sandy bridge work around algorithm
+ * libmlx4: add debian to EXTRA_DIST
+ * libmlx4: add support for "git review" command line gerrit tool
+ * libmlx4: Fix "make distcheck"
+ * Add allowed_wc_flags
+ * libmlx4: Fix valgrind errors.
+ * Raw IB QP fix
+ * libmlx4: Change inline receive interface
+ * Revert "move flow steering to experimental verbs"
+ * move flow steering to experimental verbs
+ * libmlx4: resolve segfault on ibv_xsrq_pingpong
+ * Raw Eth QP - prevent loopback on SRIOV
+ * libmlx4: remove struct ts and use direct field timestamp
+ * Fix compilation issue due to shifting bind_mw struct in ib_send_wr
+ * libmlx4: Add experimental inline receive
+ * Double check in order to prevent division by zero.
+ * Add a missing check for a value of a certain variable
+ * libmlx4 - qp: optimize single segment case around set_data_seg()
+ * libmlx4 - Inform GCC about hotspot functions so those can be optimized more aggressively.
+ * libmlx4 - Add branch prediction helpers to qp and cq data path functions.
+ * libmlx4 - Using unsigned indices allow GCC to generate a bit more efficient code.
+ * IP based addressing support
+ * Implementing verbs bind_mw (for binding type 1 memory windows)
+ * Adding support to post bind (type 2) memory windows
+ * Adding support to post invalidate messages
+ * Implementing verbs alloc_mw and dealloc_mw
+ * Adding work completions that are related to memory windows
+ * fix incorrect timestamp
+ * add a workaround for hw bug in hwclock wraparound
+ * extension verb: mlx4_query_values are reading hwclock
+ * extension verb: mlx4_query_device_ex
+ * extension verb: mlx4_create_cq_ex
+ * implement ibv_poll_cq_ex extension verb
+ * XRC - move warning to be under trace mode
+ * XRC - fix leak in legacy flow
+ * libmlx4: Globally avoid spinlocks for multithreaded apps
+ * Handle missing symbols in Xen server 6.1
+ * libmlx4: Cache link layer's type in mlx4_context. Caching will allow us to avoid ibv_query_port calls and save time in ibv_create_ah.
+ * XRC - sync to latest upstream changes
+ * XRC issues
+ * libmlx4: XRC binary compat layer
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Sun, 23 Mar 2014 14:16:10 +0200
+
+libmlx4 (1.0.4mlnx2-1) unstable; urgency=low
+
+ * libmlx4: Add Cross-channel capability
+ * libmlx4: Add mlx4_post_task
+ * libmlx4: Add mlx4_query_device_ex
+ * libmlx4: Add mlx4_modify_cq
+ * libmlx4: Support Cross-channel capability in mlx4_create_qp_ex
+ * libmlx4: Add new fields and opcodes to support Cross-channel
+ * libmlx4: Remove legacy mverbs code
+ * libmlx4: Add support for XRC QPs
+ * libmlx4: contig pages over 4GB
+ * stall code to be run only on x86
+ * Implement ibv_create_flow and ibv_destroy_flow
+ * Revert "Add support for ibv_attach_flow and ibv_detach_flow."
+ * libmlx4 fix compilation warnings
+ * Handle 0-length s/g list entries correctly
+ * libmlx4.spec.in: Fix %files macro
+ * configure: disable mverbs by default
+ * libmlx4: verbs extensions breaks MVERBS implementation
+ * shared_mr support on top of verbs extension
+ * libmlx4: Infra-structure changes to support verbs extensions
+ * fixed an issue with definition of container_of
+ * Revert "verbs extension mechanism based on Sean first patch"
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Mon, 7 Jan 2013 13:38:10 +0200
+
+libmlx4 (1.0.4mlnx1-1) unstable; urgency=low
+
+ * New Mellanox release.
+
+ -- Vladimir Sokolovsky <vlad@mellanox.com> Mon, 7 Jan 2013 13:38:10 +0200
+
+libmlx4 (1.0.4-1) unstable; urgency=low
+
+ * New upstream release.
+ - IBoE multicast support.
+ * Update maintainer and remove DM-Upload-Allowed now that I'm a DD.
+
+ -- Roland Dreier <rbd@debian.org> Wed, 28 Mar 2012 10:31:52 -0700
+
+libmlx4 (1.0.3-1) unstable; urgency=low
+
+ * New upstream release.
+ - Add ConnectX-3 support.
+ - Add IBoE support.
+ * Since we have plugin in /usr/lib/libibverbs, we need to depend on
+ libibverbs (>= 1.1.3).
+
+ -- Roland Dreier <roland@digitalvampire.org> Wed, 06 Jul 2011 23:54:24 -0700
+
+libmlx4 (1.0.2-1) unstable; urgency=low
+
+ * New upstream release.
+ - Fix potential problems running under Valgrind.
+ - Add support for resize CQ operation.
+ - Fix other minor bugs.
+ * Update maintainer and set DM-Upload-Allowed to yes. (Closes: #632108)
+ * Switch to dpkg-source 3.0 (quilt) format.
+ * Acknowledge NMU (Closes: #621664).
+ * Change build system from cdbs to debhelper 7.
+ * Use libibverbs 1.1.3 feature to move plugin to /usr/lib/libibverbs
+ to fix multiple problems with a not-exactly-shlib in /usr/lib.
+ * Add debian/watch file.
+ * Move -dbg package to section debug.
+ * Update to Standards-Version: 3.9.2.
+
+ -- Roland Dreier <roland@digitalvampire.org> Wed, 06 Jul 2011 13:32:18 -0700
+
+libmlx4 (1.0-1.1) unstable; urgency=low
+
+ * Non-maintainer upload.
+ * Don't ship .la files (Closes: #621664).
+
+ -- Luk Claes <luk@debian.org> Fri, 01 Jul 2011 19:09:59 +0200
libmlx4 (1.0-1) unstable; urgency=low
Index: contrib/ofed/libmlx4/debian/compat
===================================================================
--- contrib/ofed/libmlx4/debian/compat
+++ contrib/ofed/libmlx4/debian/compat
@@ -1 +1 @@
-5
+7
Index: contrib/ofed/libmlx4/debian/control
===================================================================
--- contrib/ofed/libmlx4/debian/control
+++ contrib/ofed/libmlx4/debian/control
@@ -1,16 +1,16 @@
Source: libmlx4
Priority: extra
-Maintainer: Roland Dreier <rolandd@cisco.com>
-Build-Depends: @cdbs@, libibverbs-dev (>= 1.0)
-Standards-Version: 3.7.3
+Maintainer: Roland Dreier <rbd@debian.org>
+Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.3)
+Standards-Version: 3.9.2
Section: libs
Homepage: http://www.openfabrics.org/
Package: libmlx4-1
Section: libs
Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: A userspace driver for Mellanox ConnectX InfiniBand HCAs
+Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.3)
+Description: Userspace driver for Mellanox ConnectX InfiniBand HCAs
libmlx4 is a device-specific driver for Mellanox ConnectX InfiniBand
host channel adapters (HCAs) for the libibverbs library. This allows
userspace processes to access Mellanox HCA hardware directly with
@@ -32,7 +32,7 @@
directly to an application, which may be useful for debugging.
Package: libmlx4-1-dbg
-Section: libdevel
+Section: debug
Priority: extra
Architecture: any
Depends: ${misc:Depends}, libmlx4-1 (= ${binary:Version})
Index: contrib/ofed/libmlx4/debian/libmlx4-1.install
===================================================================
--- contrib/ofed/libmlx4/debian/libmlx4-1.install
+++ contrib/ofed/libmlx4/debian/libmlx4-1.install
@@ -1,2 +1,2 @@
-usr/lib/libmlx4-rdmav2.so
+usr/lib/libmlx4-rdmav2.so /usr/lib/libibverbs/
etc/libibverbs.d/mlx4.driver
Index: contrib/ofed/libmlx4/debian/libmlx4-dev.install
===================================================================
--- contrib/ofed/libmlx4/debian/libmlx4-dev.install
+++ contrib/ofed/libmlx4/debian/libmlx4-dev.install
@@ -1 +1 @@
-usr/lib/libmlx4.{a,la}
+usr/lib/libmlx4.a
Index: contrib/ofed/libmlx4/debian/rules
===================================================================
--- contrib/ofed/libmlx4/debian/rules
+++ contrib/ofed/libmlx4/debian/rules
@@ -1,8 +1,10 @@
#!/usr/bin/make -f
# -*- mode: makefile; coding: utf-8 -*-
-DEB_DH_INSTALL_SOURCEDIR := debian/tmp
-DEB_AUTO_UPDATE_LIBTOOL := post
+%:
+ dh $@
-include /usr/share/cdbs/1/rules/debhelper.mk
-include /usr/share/cdbs/1/class/autotools.mk
+override_dh_strip:
+ dh_strip --dbg-package=libmlx4-1-dbg
+
+override_dh_makeshlibs:
Index: contrib/ofed/libmlx4/libmlx4.spec.in
===================================================================
--- contrib/ofed/libmlx4/libmlx4.spec.in
+++ contrib/ofed/libmlx4/libmlx4.spec.in
@@ -1,15 +1,27 @@
+%{!?_with_valgrind: %define _with_valgrind 0}
+%{!?_disable_valgrind: %define _disable_valgrind 0}
+
+%if 0%{?rhel} == 6
+%if 0%{_disable_valgrind} == 0
+%define _with_valgrind 1
+%endif
+%endif
+
Name: libmlx4
-Version: 1.0
-Release: 2%{?dist}
+Version: 1.0.6mlnx1
+Release: 1%{?dist}
Summary: Mellanox ConnectX InfiniBand HCA Userspace Driver
Group: System Environment/Libraries
License: GPLv2 or BSD
Url: http://openfabrics.org/
-Source: http://openfabrics.org/downloads/mlx4/libmlx4-1.0.tar.gz
+Source: http://openfabrics.org/downloads/mlx4/libmlx4-%{version}.tar.gz
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
-BuildRequires: libibverbs-devel >= 1.1-0.1.rc2
+BuildRequires: libibverbs-devel >= 1.1.6mlnx2
+%if %{_with_valgrind}
+BuildRequires: valgrind-devel
+%endif
%description
libmlx4 provides a device-specific userspace driver for Mellanox
@@ -29,12 +41,24 @@
%setup -q -n %{name}-@VERSION@
%build
-%configure
+%if %{_with_valgrind}
+%configure %{?configure_options} --libdir=%{_libdir}/mlnx_ofed/valgrind --with-valgrind
+make %{?_smp_mflags}
+make DESTDIR=$RPM_BUILD_DIR/%{name}-%{version}/valgrind install
+rm -f $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind/*.*a
+make clean
+%endif
+
+%configure %{?configure_options}
make %{?_smp_mflags}
%install
rm -rf $RPM_BUILD_ROOT
make DESTDIR=%{buildroot} install
+%if %{_with_valgrind}
+mkdir -p %{buildroot}/%{_libdir}/mlnx_ofed
+cp -a $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind %{buildroot}/%{_libdir}/mlnx_ofed
+%endif
# remove unpackaged files from the buildroot
rm -f $RPM_BUILD_ROOT%{_libdir}/*.la $RPM_BUILD_ROOT%{_libdir}/libmlx4.so
@@ -43,15 +67,34 @@
%files
%defattr(-,root,root,-)
-%{_libdir}/libmlx4-rdmav2.so
+%{_libdir}/libmlx4*.so
+%if %{_with_valgrind}
+%{_libdir}/mlnx_ofed/valgrind/libmlx4*.so
+%endif
%{_sysconfdir}/libibverbs.d/mlx4.driver
%doc AUTHORS COPYING README
%files devel
%defattr(-,root,root,-)
-%{_libdir}/libmlx4.a
+%{_libdir}/libmlx4*.a
%changelog
+* Mon Mar 28 2012 Roland Dreier <roland@digitalvampire.org> - 1.0.4-1
+- New upstream release
+
+* Mon Mar 26 2012 Roland Dreier <roland@digitalvampire.org> - 1.0.3-1
+- New upstream release
+
+* Wed Jul 6 2011 Roland Dreier <roland@digitalvampire.org> - 1.0.2-1
+- New upstream release
+
+* Wed Jun 17 2009 Roland Dreier <rdreier@cisco.com> - 1.0.1-1
+- New upstream release
+- Change openib.org URLs to openfabrics.org URLs
+
+* Wed Feb 25 2009 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 1.0-3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild
+
* Sun Jan 27 2008 Roland Dreier <rdreier@cisco.com> - 1.0-2
- Spec file cleanups, based on Fedora review: don't mark
libmlx4.driver as a config file, since it is not user modifiable,
Index: contrib/ofed/libmlx4/src/bitmap.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/bitmap.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2000, 2011 Mellanox Technology Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef BITMAP_H
+#define BITMAP_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+#ifndef min
+#define min(a, b) \
+ ({ typeof(a) _a = (a); \
+ typeof(b) _b = (b); \
+ _a < _b ? _a : _b; })
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define MLX4_SHM_ADDR (void *)(0x8000000000000000UL)
+#define MLX4_SHMAT_FLAGS (SHM_RND)
+#else
+#define MLX4_SHM_ADDR (void *)(0x0UL)
+#define MLX4_SHMAT_FLAGS (0)
+#endif
+
+struct __dummy_h { unsigned long a[100]; };
+#define MLX4_ADDR (*(struct __dummy_h *) addr)
+#define MLX4_CONST_ADDR (*(const struct __dummy_h *) addr)
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE 8
+#define BITS_PER_WORD (BITS_PER_BYTE * sizeof(uint32_t))
+#define BITS_TO_WORDS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(uint32_t))
+
+#ifndef HPAGE_SIZE
+#define HPAGE_SIZE (2UL*1024*1024)
+#endif
+
+#define MLX4_SHM_LENGTH (HPAGE_SIZE)
+#define MLX4_Q_CHUNK_SIZE 32768
+#define MLX4_SHM_NUM_REGION 64
+
+struct mlx4_bitmap {
+ uint32_t last;
+ uint32_t top;
+ uint32_t max;
+ uint32_t avail;
+ uint32_t mask;
+ struct mlx4_spinlock lock;
+ uint32_t *table;
+};
+
+inline unsigned long mlx4_ffz(uint32_t word)
+{
+ return __builtin_ffs(~word) - 1;
+}
+
+inline void mlx4_set_bit(unsigned int nr, uint32_t *addr)
+{
+
+ addr[(nr / BITS_PER_WORD)]
+ |= (1 << (nr % BITS_PER_WORD));
+
+
+}
+
+inline void mlx4_clear_bit(unsigned int nr, uint32_t *addr)
+{
+ addr[(nr / BITS_PER_WORD)]
+ &= ~(1 << (nr % BITS_PER_WORD));
+}
+
+inline int mlx4_test_bit(unsigned int nr, const uint32_t *addr)
+{
+ return !!(addr[(nr / BITS_PER_WORD)]
+ & (1 << (nr % BITS_PER_WORD)));
+}
+
+inline uint32_t mlx4_find_first_zero_bit(const uint32_t *addr,
+ uint32_t size)
+{
+ const uint32_t *p = addr;
+ uint32_t result = 0;
+ uint32_t tmp;
+
+ while (size & ~(BITS_PER_WORD - 1)) {
+ tmp = *(p++);
+ if (~tmp)
+ goto found;
+ result += BITS_PER_WORD;
+ size -= BITS_PER_WORD;
+ }
+ if (!size)
+ return result;
+
+ tmp = (*p) | (~0UL << size);
+ if (tmp == (uint32_t)~0UL) /* Are any bits zero? */
+ return result + size; /* Nope. */
+found:
+ return result + mlx4_ffz(tmp);
+}
+
+int mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+ uint32_t obj;
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+
+ obj = mlx4_find_first_zero_bit(bitmap->table, bitmap->max);
+ if (obj < bitmap->max) {
+ mlx4_set_bit(obj, bitmap->table);
+ bitmap->last = (obj + 1);
+ if (bitmap->last == bitmap->max)
+ bitmap->last = 0;
+ obj |= bitmap->top;
+ ret = obj;
+ } else
+ ret = -1;
+
+ if (ret != -1)
+ --bitmap->avail;
+
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+static inline uint32_t find_aligned_range(uint32_t *bitmap,
+ uint32_t start, uint32_t nbits,
+ int len, int alignment)
+{
+ uint32_t end, i;
+
+again:
+ start = align(start, alignment);
+
+ while ((start < nbits) && mlx4_test_bit(start, bitmap))
+ start += alignment;
+
+ if (start >= nbits)
+ return -1;
+
+ end = start + len;
+ if (end > nbits)
+ return -1;
+
+ for (i = start + 1; i < end; i++) {
+ if (mlx4_test_bit(i, bitmap)) {
+ start = i + 1;
+ goto again;
+ }
+ }
+
+ return start;
+}
+
+static inline int mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+ int align)
+{
+ uint32_t obj;
+ int ret, i;
+
+ if (cnt == 1 && align == 1)
+ return mlx4_bitmap_alloc(bitmap);
+
+ if (cnt > bitmap->max)
+ return -1;
+
+ mlx4_spin_lock(&bitmap->lock);
+
+ obj = find_aligned_range(bitmap->table, bitmap->last,
+ bitmap->max, cnt, align);
+ if (obj >= bitmap->max) {
+ bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+ obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+ cnt, align);
+ }
+
+ if (obj < bitmap->max) {
+ for (i = 0; i < cnt; i++)
+ mlx4_set_bit(obj + i, bitmap->table);
+ if (obj == bitmap->last) {
+ bitmap->last = (obj + cnt);
+ if (bitmap->last >= bitmap->max)
+ bitmap->last = 0;
+ }
+ obj |= bitmap->top;
+ ret = obj;
+ } else
+ ret = -1;
+
+ if (ret != -1)
+ bitmap->avail -= cnt;
+
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return obj;
+}
+
+static inline void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, uint32_t obj,
+ int cnt)
+{
+ int i;
+
+ obj &= bitmap->max - 1;
+
+ mlx4_spin_lock(&bitmap->lock);
+ for (i = 0; i < cnt; i++)
+ mlx4_clear_bit(obj + i, bitmap->table);
+ bitmap->last = min(bitmap->last, obj);
+ bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask;
+ bitmap->avail += cnt;
+ mlx4_spin_unlock(&bitmap->lock);
+}
+
+static inline int is_bitmap_empty(struct mlx4_bitmap *bitmap)
+{
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+ ret = (bitmap->avail == bitmap->max) ? 1 : 0;
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+static inline int is_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+ int ret;
+
+ mlx4_spin_lock(&bitmap->lock);
+ ret = (bitmap->avail > 0) ? 1 : 0;
+ mlx4_spin_unlock(&bitmap->lock);
+
+ return ret;
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, uint32_t num, uint32_t mask)
+{
+ bitmap->last = 0;
+ bitmap->top = 0;
+ bitmap->max = bitmap->avail = num;
+ bitmap->mask = mask;
+ bitmap->avail = bitmap->max;
+ mlx4_spinlock_init(&bitmap->lock, !mlx4_single_threaded);
+ bitmap->table = malloc(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t));
+
+ if (!bitmap->table)
+ return -ENOMEM;
+ memset((void *)bitmap->table, 0,
+ (int)(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t)));
+ return 0;
+}
+
+inline void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+ if (bitmap->table)
+ free(bitmap->table);
+}
+
+static inline void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, uint32_t obj)
+{
+ mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+#endif
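
The new src/bitmap.h above implements a word-based bitmap allocator used to hand out MLX4_Q_CHUNK_SIZE-sized chunks of a hugetlb shared-memory segment. A minimal standalone sketch of the same find-first-zero / aligned-range idea is shown below; it deliberately drops the mlx4 spinlock helpers, and every demo_* name is invented for this illustration rather than taken from the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_BITS_PER_WORD (8 * sizeof(uint32_t))

/* Mark one chunk as used (same arithmetic as mlx4_set_bit). */
static void demo_set_bit(unsigned nr, uint32_t *table)
{
	table[nr / DEMO_BITS_PER_WORD] |= 1u << (nr % DEMO_BITS_PER_WORD);
}

static int demo_test_bit(unsigned nr, const uint32_t *table)
{
	return !!(table[nr / DEMO_BITS_PER_WORD] & (1u << (nr % DEMO_BITS_PER_WORD)));
}

/* Find 'len' free chunks starting on an 'alignment' boundary, or -1
 * (a simplified, lock-free cousin of find_aligned_range()). */
static int demo_find_aligned_range(const uint32_t *table, unsigned nbits,
				   unsigned len, unsigned alignment)
{
	unsigned start, i;

	for (start = 0; start + len <= nbits; start += alignment) {
		for (i = 0; i < len; i++)
			if (demo_test_bit(start + i, table))
				break;
		if (i == len)
			return (int)start;
	}
	return -1;
}

int main(void)
{
	uint32_t table[4];		/* 4 words = 128 chunks */
	int run, single;

	memset(table, 0, sizeof(table));
	demo_set_bit(0, table);		/* pretend chunk 0 is already taken */

	run = demo_find_aligned_range(table, 128, 4, 4);	/* expect 4 */
	single = demo_find_aligned_range(table, 128, 1, 1);	/* expect 1 */
	printf("aligned run at %d, single chunk at %d\n", run, single);
	return 0;
}
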
Index: contrib/ofed/libmlx4/src/buf.c
===================================================================
--- contrib/ofed/libmlx4/src/buf.c
+++ contrib/ofed/libmlx4/src/buf.c
@@ -36,9 +36,21 @@
#include <stdlib.h>
#include <errno.h>
+#include <signal.h>
#include <sys/mman.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <stdio.h>
#include "mlx4.h"
+#include "bitmap.h"
+
+struct mlx4_hugetlb_mem {
+ int shmid;
+ char *shmaddr;
+ struct mlx4_bitmap bitmap;
+ struct list_head list;
+};
#if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE))
@@ -59,13 +71,154 @@
#endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */
+void mlx4_hugetlb_mem_free(struct mlx4_hugetlb_mem *hmem)
+{
+ mlx4_bitmap_cleanup(&hmem->bitmap);
+
+ if (shmdt((const void *)hmem->shmaddr) != 0) {
+ if (mlx4_trace)
+ perror("Detach shm failure");
+ }
+ free(hmem);
+}
+static void mlx4_free_buf_huge_ex(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ int do_fork)
+{
+ struct mlx4_hugetlb_mem *hmem;
+
+ if (do_fork)
+ ibv_dofork_range(buf->buf, buf->length);
+
+ if (buf->hmem == NULL) {
+ if (mlx4_trace)
+ perror("No hugetlb mem");
+ return;
+ }
+
+ hmem = (struct mlx4_hugetlb_mem *) buf->hmem;
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ mlx4_bitmap_free_range(&hmem->bitmap, buf->base,
+ buf->length/MLX4_Q_CHUNK_SIZE);
+
+ if (is_bitmap_empty(&hmem->bitmap)) {
+ list_del(&hmem->list);
+ mlx4_hugetlb_mem_free(hmem);
+ }
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+}
+
+void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf)
+{
+ mlx4_free_buf_huge_ex(mctx, buf, 1);
+}
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 0
+#endif
+
+struct mlx4_hugetlb_mem *mxl4_hugetlb_mem_alloc(size_t size)
+{
+ struct mlx4_hugetlb_mem *hmem;
+ size_t shm_len;
+
+ hmem = malloc(sizeof(*hmem));
+ if (!hmem)
+ return NULL;
+
+ shm_len = (size > MLX4_SHM_LENGTH) ? align(size, MLX4_SHM_LENGTH) :
+ MLX4_SHM_LENGTH;
+ hmem->shmid = shmget(IPC_PRIVATE, shm_len,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (hmem->shmid < 0) {
+ if (mlx4_trace)
+ perror("shmget");
+ free(hmem);
+ return NULL;
+ }
+
+ hmem->shmaddr = shmat(hmem->shmid, MLX4_SHM_ADDR, MLX4_SHMAT_FLAGS);
+ if (hmem->shmaddr == (char *)-1) {
+ if (mlx4_trace)
+ perror("Shared memory attach failure");
+ shmctl(hmem->shmid, IPC_RMID, NULL);
+ free(hmem);
+ return NULL;
+ }
+
+ if (mlx4_bitmap_init(&hmem->bitmap, shm_len/MLX4_Q_CHUNK_SIZE,
+ shm_len/MLX4_Q_CHUNK_SIZE - 1)) {
+ if (mlx4_trace)
+ perror("mlx4_bitmap_init");
+ mlx4_hugetlb_mem_free(hmem);
+ return NULL;
+ }
+
+ /* Marked to destroy when process detaches from shmget segment */
+ shmctl(hmem->shmid, IPC_RMID, NULL);
+
+ return hmem;
+}
+
+
+int mlx4_alloc_prefered_buf(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ size_t size, int page_size,
+ enum mlx4_alloc_type alloc_type,
+ const char *component)
+{
+ int ret = 1;
+
+ buf->hmem = NULL;
+ /* Fallback mechanism is used below;
+ priority is: huge pages, contig pages, default allocation */
+ if (alloc_type == MLX4_ALLOC_TYPE_HUGE ||
+ alloc_type == MLX4_ALLOC_TYPE_PREFER_HUGE ||
+ alloc_type == MLX4_ALLOC_TYPE_ALL) {
+ ret = mlx4_alloc_buf_huge(mctx, buf,
+ size,
+ page_size);
+ if (!ret)
+ return 0;
+
+ /* Checking whether HUGE is forced */
+ if (alloc_type == MLX4_ALLOC_TYPE_HUGE)
+ return -1;
+ if (mlx4_trace)
+ printf(PFX "Huge mode allocation has failed,fallback to %s mode\n",
+ MLX4_ALLOC_TYPE_ALL ? "contig" : "default");
+
+ }
+
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG ||
+ alloc_type == MLX4_ALLOC_TYPE_PREFER_CONTIG ||
+ alloc_type == MLX4_ALLOC_TYPE_ALL) {
+ ret = mlx4_alloc_buf_contig(mctx, buf,
+ size,
+ page_size,
+ component, NULL);
+ if (!ret)
+ return 0;
+
+ /* Checking whether CONTIG is forced */
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG)
+ return -1;
+ if (mlx4_trace)
+ printf(PFX "Contig mode allocation has failed,fallback to default mode\n");
+ }
+
+ return mlx4_alloc_buf(buf, size, page_size);
+
+}
+
+
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
{
int ret;
buf->length = align(size, page_size);
buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON, -1, 0);
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (buf->buf == MAP_FAILED)
return errno;
@@ -78,6 +231,271 @@
void mlx4_free_buf(struct mlx4_buf *buf)
{
- ibv_dofork_range(buf->buf, buf->length);
- munmap(buf->buf, buf->length);
+ if (buf->length) {
+ ibv_dofork_range(buf->buf, buf->length);
+ munmap(buf->buf, buf->length);
+ }
+}
+
+/* This function computes log2(v) rounded up.
+* We don't want to have a dependency on libm, which exposes the ceil & log2 APIs.
+* Code was written based on public domain code:
+ URL: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog.
+*/
+static uint32_t mlx4_get_block_order(uint32_t v)
+{
+ static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+ static const uint32_t shift_arr[] = {1, 2, 4, 8, 16};
+ int i;
+ uint32_t input_val = v;
+
+ register uint32_t r = 0;/* result of log2(v) will go here */
+ for (i = 4; i >= 0; i--) {
+
+ if (v & bits_arr[i]) {
+ v >>= shift_arr[i];
+ r |= shift_arr[i];
+ }
+ }
+ /* Rounding up if required */
+ r += !!(input_val & ((1 << r) - 1));
+
+ return r;
+}
+
+
+static int mlx4_finalize_contiguous_alloc(struct mlx4_buf *buf,
+ void *addr,
+ size_t length)
+{
+ if (ibv_dontfork_range(addr, length)) {
+ munmap(addr, length);
+ return 1;
+ }
+
+ /* We also hook addr & length internally for further
+ use via dreg_mr. In the ibv_mr returned to the user, the length or
+ address may differ from the allocated length or address because of
+ alignment.
+ */
+ buf->buf = addr;
+ buf->length = length;
+ return 0;
+
+}
+
+
+void mlx4_get_alloc_type(struct ibv_context *context, const char *component,
+ enum mlx4_alloc_type *alloc_type,
+ enum mlx4_alloc_type default_alloc_type)
+
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+ char name_buff[128];
+
+ sprintf(name_buff, "%s_ALLOC_TYPE", component);
+
+ /* First set defaults */
+ *alloc_type = default_alloc_type;
+
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ if (!strcasecmp(env_value, "ANON"))
+ *alloc_type = MLX4_ALLOC_TYPE_ANON;
+ else if (!strcasecmp(env_value, "HUGE"))
+ *alloc_type = MLX4_ALLOC_TYPE_HUGE;
+ else if (!strcasecmp(env_value, "CONTIG"))
+ *alloc_type = MLX4_ALLOC_TYPE_CONTIG;
+ else if (!strcasecmp(env_value, "PREFER_CONTIG"))
+ *alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+ else if (!strcasecmp(env_value, "PREFER_HUGE"))
+ *alloc_type = MLX4_ALLOC_TYPE_PREFER_HUGE;
+ else if (!strcasecmp(env_value, "ALL"))
+ *alloc_type = MLX4_ALLOC_TYPE_ALL;
+ }
+
+ return;
+}
+
+
+static void mlx4_alloc_get_env_info(struct ibv_context *context,
+ int *max_log2_contig_block_size,
+ int *min_log2_contig_block_size,
+ const char *component)
+
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+ int value;
+ char name_buff[128];
+
+ /* First set defaults */
+ *max_log2_contig_block_size = MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE;
+ *min_log2_contig_block_size = MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE;
+
+ sprintf(name_buff, "%s_MAX_LOG2_CONTIG_BSIZE", component);
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ value = atoi(env_value);
+ if (value <= MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE &&
+ value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE)
+ *max_log2_contig_block_size = value;
+ else
+ fprintf(stderr,
+ "Invalid value %d for %s\n",
+ value, name_buff);
+ }
+ sprintf(name_buff, "%s_MIN_LOG2_CONTIG_BSIZE", component);
+ if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) {
+ value = atoi(env_value);
+ if (value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE &&
+ value <= *max_log2_contig_block_size)
+ *min_log2_contig_block_size = value;
+ else
+ fprintf(stderr,
+ "Invalid value %d for %s\n",
+ value, name_buff);
+ }
+ return;
}
+
+
+
+int mlx4_alloc_buf_contig(struct mlx4_context *mctx,
+ struct mlx4_buf *buf, size_t size,
+ int page_size,
+ const char *component, void *req_addr)
+{
+ void *addr = NULL;
+ int block_size_exp;
+ int max_log2_contig_block_size;
+ int min_log2_contig_block_size;
+ int mmap_flags = MAP_SHARED;
+ void *act_addr = NULL;
+ size_t act_size = size;
+
+ struct ibv_context *context = &(mctx->ibv_ctx);
+
+ mlx4_alloc_get_env_info(&mctx->ibv_ctx,
+ &max_log2_contig_block_size,
+ &min_log2_contig_block_size,
+ component);
+
+ /* Checking that we don't pass max block size */
+ if (size >= (1 << max_log2_contig_block_size))
+ block_size_exp = max_log2_contig_block_size;
+ else
+ block_size_exp = mlx4_get_block_order(size);
+
+ if (req_addr) {
+ act_addr = (void *)((uintptr_t)req_addr & ~((uintptr_t)page_size - 1));
+ act_size += (size_t)((uintptr_t)req_addr - (uintptr_t)act_addr);
+ mmap_flags |= MAP_FIXED;
+ }
+
+ do {
+ /* The second mmap parameter holds the total required length
+ for this contiguous allocation, aligned to the page size.
+ The last (offset) parameter passed to mmap
+ must be a multiple of the page size and encodes:
+ 1) an indication that we are in contiguous-memory
+ allocation mode (command value #2), and
+ 2) the required size of each block.
+ To enable future mmap actions we
+ use the last 3 bits of the offset parameter
+ as the command identifier.
+ */
+ addr = mmap(act_addr, act_size,
+ PROT_WRITE | PROT_READ, mmap_flags,
+ context->cmd_fd,
+ page_size *
+ (MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD +
+ (block_size_exp << MLX4_MMAP_CMD_BITS)));
+
+ /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/
+ if (addr != MAP_FAILED)
+ break;
+
+ /* We failed - set addr to NULL and check whether
+ a retry is relevant.
+ * If the kernel doesn't support this command because of
+ compatibility issues we'll also get EINVAL.
+ */
+ addr = NULL;
+ if (errno == EINVAL)
+ break;
+
+ /* Retry, asking for fewer contiguous pages per block */
+ block_size_exp -= 1;
+ } while (block_size_exp >= min_log2_contig_block_size);
+
+ if (!addr)
+ return 1;
+
+ /* All was OK - perform the final steps to have this addr ready */
+ return mlx4_finalize_contiguous_alloc(buf, addr, act_size);
+}
+
+int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size)
+{
+ struct mlx4_hugetlb_mem *hmem, *tmp_hmem;
+ int found = 0;
+ int ret = 0;
+ LIST_HEAD(slist);
+
+ buf->length = align(size, MLX4_Q_CHUNK_SIZE);
+
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ list_for_each_entry_safe(hmem, tmp_hmem, &mctx->hugetlb_list, list) {
+ if (is_bitmap_avail(&hmem->bitmap)) {
+ buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap,
+ buf->length/MLX4_Q_CHUNK_SIZE, 1);
+ if (buf->base == -1)
+ continue;
+ else {
+ buf->hmem = (void *)hmem;
+ found = 1;
+ break;
+ }
+ }
+ }
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+
+ if (!found) {
+ int avail;
+
+ hmem = mxl4_hugetlb_mem_alloc(buf->length);
+ if (hmem == NULL)
+ return -1;
+
+ buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap,
+ buf->length/MLX4_Q_CHUNK_SIZE, 1);
+ if (buf->base == -1) {
+ if (mlx4_trace)
+ perror("mlx4_bitmap_alloc_range");
+ mlx4_hugetlb_mem_free(hmem);
+ return -1;
+ }
+
+ buf->hmem = (void *)hmem;
+
+ avail = is_bitmap_avail(&hmem->bitmap);
+ mlx4_spin_lock(&mctx->hugetlb_lock);
+ if (avail)
+ list_add(&hmem->list, &mctx->hugetlb_list);
+ else
+ list_add_tail(&hmem->list, &mctx->hugetlb_list);
+ mlx4_spin_unlock(&mctx->hugetlb_lock);
+ }
+
+ buf->buf = hmem->shmaddr + (buf->base * MLX4_Q_CHUNK_SIZE);
+
+ ret = ibv_dontfork_range(buf->buf, buf->length);
+ if (ret) {
+ mlx4_free_buf_huge_ex(mctx, buf, 0);
+ buf->hmem = NULL;
+ if (mlx4_trace)
+ perror("ibv_dontfork_range");
+ }
+
+ return ret;
+}
+
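
mlx4_get_block_order() above computes log2 rounded up with the bit-twiddling trick cited in its comment, which is how buf.c avoids depending on libm. The standalone sketch below reproduces that technique and checks it against a naive loop; block_order and naive_order are names invented for this illustration, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Same technique as mlx4_get_block_order(): binary-search the highest set
 * bit, then round up if any lower bit is set. */
static uint32_t block_order(uint32_t v)
{
	static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
	static const uint32_t shift_arr[] = {1, 2, 4, 8, 16};
	uint32_t input_val = v;
	uint32_t r = 0;
	int i;

	for (i = 4; i >= 0; i--) {
		if (v & bits_arr[i]) {
			v >>= shift_arr[i];
			r |= shift_arr[i];
		}
	}
	r += !!(input_val & ((1u << r) - 1));	/* round up if v is not a power of two */
	return r;
}

/* Naive reference: smallest r such that (1 << r) >= v. */
static uint32_t naive_order(uint32_t v)
{
	uint32_t r = 0;

	while ((1ull << r) < v)
		r++;
	return r;
}

int main(void)
{
	uint32_t v;

	for (v = 1; v < (1u << 20); v++)
		if (block_order(v) != naive_order(v))
			printf("mismatch at %u: %u vs %u\n",
			       v, block_order(v), naive_order(v));
	printf("block_order(4096) = %u\n", block_order(4096));	/* prints 12 */
	return 0;
}
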
Index: contrib/ofed/libmlx4/src/cq.c
===================================================================
--- contrib/ofed/libmlx4/src/cq.c
+++ contrib/ofed/libmlx4/src/cq.c
@@ -47,6 +47,8 @@
#include "mlx4.h"
#include "doorbell.h"
+int mlx4_stall_num_loop = 300;
+
enum {
MLX4_CQ_DOORBELL = 0x20
};
@@ -61,8 +63,18 @@
#define MLX4_CQ_DB_REQ_NOT (2 << 24)
enum {
+ MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25,
+ MLX4_CQE_L2_TUNNEL_L4_CSUM = 1 << 26,
+ MLX4_CQE_L2_TUNNEL = 1 << 27,
+ MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29,
+ MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
MLX4_CQE_OWNER_MASK = 0x80,
MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_INL_SCATTER_MASK = 0x20,
MLX4_CQE_OPCODE_MASK = 0x1f
};
@@ -82,23 +94,50 @@
MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22,
};
+enum {
+ MLX4_CQE_STATUS_L4_CSUM = 1 << 2,
+ MLX4_CQE_STATUS_IPV4 = 1 << 6,
+ MLX4_CQE_STATUS_IPV4F = 1 << 7,
+ MLX4_CQE_STATUS_IPV6 = 1 << 8,
+ MLX4_CQE_STATUS_IPV4OPT = 1 << 9,
+ MLX4_CQE_STATUS_TCP = 1 << 10,
+ MLX4_CQE_STATUS_UDP = 1 << 11,
+ MLX4_CQE_STATUS_IPOK = 1 << 12
+};
+
+
struct mlx4_cqe {
- uint32_t my_qpn;
+ uint32_t vlan_my_qpn;
uint32_t immed_rss_invalid;
uint32_t g_mlpath_rqpn;
- uint8_t sl;
- uint8_t reserved1;
- uint16_t rlid;
- uint32_t reserved2;
+ union {
+ struct {
+ union {
+ struct {
+ uint16_t sl_vid;
+ uint16_t rlid;
+ };
+ uint32_t timestamp_16_47;
+ };
+ uint16_t status;
+ uint8_t reserved2;
+ uint8_t badfcs_enc;
+ };
+ struct {
+ uint16_t reserved4;
+ uint8_t smac[6];
+ };
+ };
uint32_t byte_cnt;
uint16_t wqe_index;
uint16_t checksum;
- uint8_t reserved3[3];
+ uint8_t reserved5[1];
+ uint16_t timestamp_0_15;
uint8_t owner_sr_opcode;
-};
+} __attribute__((packed));
struct mlx4_err_cqe {
- uint32_t my_qpn;
+ uint32_t vlan_my_qpn;
uint32_t reserved1[5];
uint16_t wqe_index;
uint8_t vendor_err;
@@ -118,7 +157,7 @@
struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe;
return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
- !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : tcqe;
+ !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe;
}
static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq)
@@ -126,18 +165,13 @@
return get_sw_cqe(cq, cq->cons_index);
}
-static void update_cons_index(struct mlx4_cq *cq)
-{
- *cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
-}
-
static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc)
{
if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR)
printf(PFX "local QP operation err "
"(QPN %06x, WQE index %x, vendor syndrome %02x, "
"opcode = %02x)\n",
- htonl(cqe->my_qpn), htonl(cqe->wqe_index),
+ htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index),
cqe->vendor_err,
cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
@@ -191,22 +225,34 @@
static int mlx4_poll_one(struct mlx4_cq *cq,
struct mlx4_qp **cur_qp,
- struct ibv_wc *wc)
+ struct ibv_exp_wc *wc,
+ uint32_t wc_size, int is_exp)
{
struct mlx4_wq *wq;
struct mlx4_cqe *cqe;
- struct mlx4_srq *srq = NULL;
+ struct mlx4_srq *srq;
uint32_t qpn;
- uint32_t srqn;
uint32_t g_mlpath_rqpn;
uint16_t wqe_index;
int is_error;
int is_send;
-
+ int size;
+ int left;
+ int list_len;
+ int i;
+ struct mlx4_inlr_rbuff *rbuffs;
+ uint8_t *sbuff;
+ int timestamp_en = !!(cq->creation_flags &
+ IBV_EXP_CQ_TIMESTAMP);
+ uint64_t exp_wc_flags = 0;
+ uint64_t wc_flags = 0;
cqe = next_cqe_sw(cq);
if (!cqe)
return CQ_EMPTY;
+ if (cq->cqe_size == 64)
+ ++cqe;
+
++cq->cons_index;
VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
@@ -217,36 +263,44 @@
*/
rmb();
- qpn = ntohl(cqe->my_qpn);
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ wc->qp_num = qpn;
is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+
+ /* include checksum as a workaround for the calc opcode */
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
- MLX4_CQE_OPCODE_ERROR;
+ MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff);
- if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
- srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
+ if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
/*
- * We do not have to take the XRC SRQ table lock here,
- * because CQs will be locked while XRC SRQs are removed
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
* from the table.
*/
- srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
+ *cur_qp = NULL;
+ srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table,
+ ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
if (!srq)
return CQ_POLL_ERR;
- } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
- /*
- * We do not have to take the QP table lock here,
- * because CQs will be locked while QPs are removed
- * from the table.
- */
- *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
- qpn & 0xffffff);
- if (!*cur_qp)
- return CQ_POLL_ERR;
+ } else {
+ if (unlikely(!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num))) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!*cur_qp))
+ return CQ_POLL_ERR;
+ }
+ if (is_exp) {
+ wc->qp = &((*cur_qp)->verbs_qp.qp);
+ exp_wc_flags |= IBV_EXP_WC_QP;
+ }
+ srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL;
}
- wc->qp_num = qpn & 0xffffff;
-
if (is_send) {
wq = &(*cur_qp)->sq;
wqe_index = ntohs(cqe->wqe_index);
@@ -257,112 +311,267 @@
wqe_index = htons(cqe->wqe_index);
wc->wr_id = srq->wrid[wqe_index];
mlx4_free_srq_wqe(srq, wqe_index);
- } else if ((*cur_qp)->ibv_qp.srq) {
- srq = to_msrq((*cur_qp)->ibv_qp.srq);
- wqe_index = htons(cqe->wqe_index);
- wc->wr_id = srq->wrid[wqe_index];
- mlx4_free_srq_wqe(srq, wqe_index);
+ if (is_exp) {
+ wc->srq = &(srq->verbs_srq.srq);
+ exp_wc_flags |= IBV_EXP_WC_SRQ;
+ }
} else {
wq = &(*cur_qp)->rq;
- wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wqe_index = wq->tail & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[wqe_index];
++wq->tail;
}
- if (is_error) {
- mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ if (unlikely(is_error)) {
+ mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe,
+ (struct ibv_wc *)wc);
return CQ_OK;
}
wc->status = IBV_WC_SUCCESS;
+ if (timestamp_en && offsetof(struct ibv_exp_wc, timestamp) < wc_size) {
+ /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is
+ * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't
+ * supported */
+ if (cq->creation_flags &
+ IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME)
+ wc->timestamp = 0;
+ else {
+ wc->timestamp =
+ (uint64_t)(ntohl(cqe->timestamp_16_47) +
+ !cqe->timestamp_0_15) << 16
+ | (uint64_t)ntohs(cqe->timestamp_0_15);
+ exp_wc_flags |= IBV_EXP_WC_WITH_TIMESTAMP;
+ }
+ }
+
if (is_send) {
- wc->wc_flags = 0;
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_CALC_RDMA_WRITE_IMM:
case MLX4_OPCODE_RDMA_WRITE_IMM:
- wc->wc_flags |= IBV_WC_WITH_IMM;
+ wc_flags |= IBV_WC_WITH_IMM;
case MLX4_OPCODE_RDMA_WRITE:
- wc->opcode = IBV_WC_RDMA_WRITE;
+ wc->exp_opcode = IBV_EXP_WC_RDMA_WRITE;
break;
case MLX4_OPCODE_SEND_IMM:
- wc->wc_flags |= IBV_WC_WITH_IMM;
+ wc_flags |= IBV_WC_WITH_IMM;
case MLX4_OPCODE_SEND:
- wc->opcode = IBV_WC_SEND;
+ wc->exp_opcode = IBV_EXP_WC_SEND;
break;
case MLX4_OPCODE_RDMA_READ:
- wc->opcode = IBV_WC_RDMA_READ;
+ wc->exp_opcode = IBV_EXP_WC_RDMA_READ;
wc->byte_len = ntohl(cqe->byte_cnt);
break;
case MLX4_OPCODE_ATOMIC_CS:
- wc->opcode = IBV_WC_COMP_SWAP;
+ wc->exp_opcode = IBV_EXP_WC_COMP_SWAP;
wc->byte_len = 8;
break;
case MLX4_OPCODE_ATOMIC_FA:
- wc->opcode = IBV_WC_FETCH_ADD;
+ wc->exp_opcode = IBV_EXP_WC_FETCH_ADD;
wc->byte_len = 8;
break;
+ case MLX4_OPCODE_ATOMIC_MASK_CS:
+ wc->exp_opcode = IBV_EXP_WC_MASKED_COMP_SWAP;
+ break;
+ case MLX4_OPCODE_ATOMIC_MASK_FA:
+ wc->exp_opcode = IBV_EXP_WC_MASKED_FETCH_ADD;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ if (unlikely(!is_exp))
+ return CQ_POLL_ERR;
+ wc->exp_opcode = IBV_EXP_WC_LOCAL_INV;
+ break;
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->exp_opcode = IBV_EXP_WC_SEND;
+ break;
case MLX4_OPCODE_BIND_MW:
- wc->opcode = IBV_WC_BIND_MW;
+ wc->exp_opcode = IBV_EXP_WC_BIND_MW;
break;
default:
/* assume it's a send completion */
- wc->opcode = IBV_WC_SEND;
+ wc->exp_opcode = IBV_EXP_WC_SEND;
break;
}
} else {
wc->byte_len = ntohl(cqe->byte_cnt);
+ if ((*cur_qp) && (*cur_qp)->max_inlr_sg &&
+ (cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) {
+ rbuffs = (*cur_qp)->inlr_buff.buff[wqe_index].sg_list;
+ list_len = (*cur_qp)->inlr_buff.buff[wqe_index].list_len;
+ sbuff = mlx4_get_recv_wqe((*cur_qp), wqe_index);
+ left = wc->byte_len;
+ for (i = 0; (i < list_len) && left; i++) {
+ size = min(rbuffs->rlen, left);
+ memcpy(rbuffs->rbuff, sbuff, size);
+ left -= size;
+ rbuffs++;
+ sbuff += size;
+ }
+ if (left) {
+ wc->status = IBV_WC_LOC_LEN_ERR;
+ return CQ_OK;
+ }
+ }
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
- wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
- wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->exp_opcode = IBV_EXP_WC_RECV_RDMA_WITH_IMM;
+ wc_flags = IBV_WC_WITH_IMM;
wc->imm_data = cqe->immed_rss_invalid;
break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ if (unlikely(!is_exp))
+ return CQ_POLL_ERR;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ exp_wc_flags |= IBV_EXP_WC_WITH_INV;
+ wc->imm_data = ntohl(cqe->immed_rss_invalid);
+ break;
case MLX4_RECV_OPCODE_SEND:
- wc->opcode = IBV_WC_RECV;
- wc->wc_flags = 0;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ wc_flags = 0;
break;
case MLX4_RECV_OPCODE_SEND_IMM:
- wc->opcode = IBV_WC_RECV;
- wc->wc_flags = IBV_WC_WITH_IMM;
+ wc->exp_opcode = IBV_EXP_WC_RECV;
+ wc_flags = IBV_WC_WITH_IMM;
wc->imm_data = cqe->immed_rss_invalid;
break;
}
- wc->slid = ntohs(cqe->rlid);
- wc->sl = cqe->sl >> 4;
+ if (!timestamp_en) {
+ exp_wc_flags |= IBV_EXP_WC_WITH_SLID;
+ wc->slid = ntohs(cqe->rlid);
+ }
g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xffffff;
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
- wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
+ wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f;
+ /* When working with XRC SRQs we have no QP to check the link layer.
+ * Using the IB SL; RoCE should be considered. (TBD)
+ */
+ /* sl is invalid when timestamp is used */
+ if (!timestamp_en) {
+ if ((*cur_qp) && (*cur_qp)->link_layer ==
+ IBV_LINK_LAYER_ETHERNET)
+ wc->sl = ntohs(cqe->sl_vid) >> 13;
+ else
+ wc->sl = ntohs(cqe->sl_vid) >> 12;
+ exp_wc_flags |= IBV_EXP_WC_WITH_SL;
+ }
+ if (is_exp) {
+ if ((*cur_qp) && ((*cur_qp)->qp_cap_cache &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)) {
+ /* Only ConnectX-3 Pro reports checksum for now */
+ exp_wc_flags |=
+ MLX4_TRANSPOSE(cqe->badfcs_enc,
+ MLX4_CQE_STATUS_L4_CSUM,
+ (uint64_t)IBV_EXP_WC_RX_TCP_UDP_CSUM_OK) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPOK),
+ (uint64_t)IBV_EXP_WC_RX_IP_CSUM_OK) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPV4),
+ (uint64_t)IBV_EXP_WC_RX_IPV4_PACKET) |
+ mlx4_transpose_uint16_t(cqe->status,
+ htons(MLX4_CQE_STATUS_IPV6),
+ (uint64_t)IBV_EXP_WC_RX_IPV6_PACKET) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL),
+ (uint64_t)IBV_EXP_WC_RX_TUNNEL_PACKET) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_IPOK),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_IP_CSUM_OK) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_L4_CSUM),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_TCP_UDP_CSUM_OK) |
+ mlx4_transpose_uint32_t(cqe->vlan_my_qpn,
+ htonl(MLX4_CQE_L2_TUNNEL_IPV4),
+ (uint64_t)IBV_EXP_WC_RX_OUTER_IPV4_PACKET);
+ exp_wc_flags |=
+ MLX4_TRANSPOSE(~exp_wc_flags,
+ IBV_EXP_WC_RX_OUTER_IPV4_PACKET,
+ IBV_EXP_WC_RX_OUTER_IPV6_PACKET);
+ }
+ }
}
+ if (is_exp)
+ wc->exp_wc_flags = exp_wc_flags | (uint64_t)wc_flags;
+
+ ((struct ibv_wc *)wc)->wc_flags = wc_flags;
+
return CQ_OK;
}
-int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+#if defined(__amd64__) || defined(__i386__)
+static inline unsigned long get_cycles()
+{
+ unsigned low, high;
+ unsigned long long val;
+ asm volatile ("rdtsc" : "=a" (low), "=d" (high));
+ val = high;
+ val = (val << 32) | low;
+ return val;
+}
+#else
+static inline unsigned long get_cycles()
+{
+ return 0;
+}
+#endif
+
+static void mlx4_stall_poll_cq()
+{
+ int i;
+
+ for (i = 0; i < mlx4_stall_num_loop; i++)
+ (void)get_cycles();
+}
+
+int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_exp_wc *wc,
+ uint32_t wc_size, int is_exp)
{
struct mlx4_cq *cq = to_mcq(ibcq);
struct mlx4_qp *qp = NULL;
int npolled;
int err = CQ_OK;
- pthread_spin_lock(&cq->lock);
-
+ if (unlikely(cq->stall_next_poll)) {
+ cq->stall_next_poll = 0;
+ mlx4_stall_poll_cq();
+ }
+ mlx4_lock(&cq->lock);
+
for (npolled = 0; npolled < ne; ++npolled) {
- err = mlx4_poll_one(cq, &qp, wc + npolled);
- if (err != CQ_OK)
+ err = mlx4_poll_one(cq, &qp, ((void *)wc) + npolled * wc_size,
+ wc_size, is_exp);
+ if (unlikely(err != CQ_OK))
break;
}
- if (npolled)
- update_cons_index(cq);
+ if (likely(npolled || err == CQ_POLL_ERR))
+ mlx4_update_cons_index(cq);
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
+ if (unlikely(cq->stall_enable && err == CQ_EMPTY))
+ cq->stall_next_poll = 1;
+
return err == CQ_POLL_ERR ? err : npolled;
}
+int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries,
+ struct ibv_exp_wc *wc, uint32_t wc_size)
+{
+ return mlx4_poll_cq(ibcq, num_entries, wc, wc_size, 1);
+}
+
+int mlx4_poll_ibv_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
+{
+ return mlx4_poll_cq(ibcq, ne, (struct ibv_exp_wc *)wc, sizeof(*wc), 0);
+}
+
int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
{
struct mlx4_cq *cq = to_mcq(ibvcq);
@@ -402,12 +611,10 @@
uint32_t prod_index;
uint8_t owner_bit;
int nfreed = 0;
- int is_xrc_srq = 0;
int cqe_inc = cq->cqe_size == 64 ? 1 : 0;
- if (srq && srq->ibv_srq.xrc_cq)
- is_xrc_srq = 1;
-
+ if (cq->last_qp && cq->last_qp->verbs_qp.qp.qp_num == qpn)
+ cq->last_qp = NULL;
/*
* First we need to find the current producer index, so we
* know where to start cleaning from. It doesn't matter if HW
@@ -426,12 +633,12 @@
while ((int) --prod_index - (int) cq->cons_index >= 0) {
cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
cqe += cqe_inc;
- if (is_xrc_srq &&
- (ntohl(cqe->g_mlpath_rqpn & 0xffffff) == srq->srqn) &&
+ if (srq && srq->ext_srq &&
+ ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num &&
!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
++nfreed;
- } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) {
+ } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
++nfreed;
@@ -452,22 +659,22 @@
* updating consumer index.
*/
wmb();
- update_cons_index(cq);
+ mlx4_update_cons_index(cq);
}
}
void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
{
- pthread_spin_lock(&cq->lock);
+ mlx4_lock(&cq->lock);
__mlx4_cq_clean(cq, qpn, srq);
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
}
int mlx4_get_outstanding_cqes(struct mlx4_cq *cq)
{
uint32_t i;
- for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i)
+ for (i = cq->cons_index; get_sw_cqe(cq, i); ++i)
;
return i - cq->cons_index;
@@ -496,13 +703,491 @@
++cq->cons_index;
}
-int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent,
int entry_size)
{
- if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size),
- dev->page_size))
+ struct mlx4_device *dev = to_mdev(mctx->ibv_ctx.device);
+ int ret;
+ enum mlx4_alloc_type alloc_type;
+ enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+
+ if (mlx4_use_huge(&mctx->ibv_ctx, "HUGE_CQ"))
+ default_alloc_type = MLX4_ALLOC_TYPE_HUGE;
+
+ mlx4_get_alloc_type(&mctx->ibv_ctx, MLX4_CQ_PREFIX, &alloc_type,
+ default_alloc_type);
+
+ ret = mlx4_alloc_prefered_buf(mctx, buf,
+ align(nent * entry_size, dev->page_size),
+ dev->page_size,
+ alloc_type,
+ MLX4_CQ_PREFIX);
+
+ if (ret)
return -1;
+
memset(buf->buf, 0, nent * entry_size);
return 0;
}
+
+/*
+ * poll family functions
+ */
+static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) __attribute__((always_inline));
+static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl)
+{
+ struct mlx4_srq *srq;
+ uint32_t qpn;
+ uint16_t wqe_index;
+
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+
+
+ if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) {
+ if (unlikely(qpn & MLX4_XRC_QPN_BIT)) {
+ /*
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
+ * from the table.
+ */
+ cur_qp = NULL;
+ srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table,
+ ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
+ if (!srq)
+ return CQ_POLL_ERR;
+
+ /* Advance indexes only on success */
+ wqe_index = htons(cqe->wqe_index);
+ mlx4_free_srq_wqe(srq, wqe_index);
+
+ ++cq->cons_index;
+
+ return CQ_OK;
+ }
+
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!cur_qp))
+ return CQ_POLL_ERR;
+ cq->last_qp = cur_qp;
+ }
+
+ if (!cur_qp->max_inlr_sg) {
+ /* Advance indexes only on success to enable getting
+ * the full CQE with ibv_poll_cq in case of failure
+ */
+ if (unlikely(cur_qp->verbs_qp.qp.srq)) {
+ wqe_index = htons(cqe->wqe_index);
+ mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index);
+ } else {
+ ++cur_qp->rq.tail;
+ }
+ ++cq->cons_index;
+
+ return CQ_OK;
+ }
+
+ /* We get here only when cur_qp->max_inlr_sg != 0 */
+ if (likely(cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) {
+ int size;
+ int left;
+ int list_len;
+ int i;
+ struct mlx4_inlr_rbuff *rbuffs;
+ uint8_t *sbuff;
+ int is_error;
+
+ /* include checksum as a workaround for the calc opcode */
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff);
+ if (unlikely(is_error))
+ return CQ_POLL_ERR;
+
+ wqe_index = cur_qp->rq.tail & (cur_qp->rq.wqe_cnt - 1);
+ sbuff = mlx4_get_recv_wqe(cur_qp, wqe_index);
+ left = ntohl(cqe->byte_cnt);
+ if (likely(buf)) {
+ *inl = 1;
+ memcpy(buf, sbuff, left);
+ } else {
+ rbuffs = cur_qp->inlr_buff.buff[wqe_index].sg_list;
+ list_len = cur_qp->inlr_buff.buff[wqe_index].list_len;
+ for (i = 0; (i < list_len) && left; i++) {
+ size = min(rbuffs->rlen, left);
+ memcpy(rbuffs->rbuff, sbuff, size);
+ left -= size;
+ rbuffs++;
+ sbuff += size;
+ }
+ if (left)
+ return CQ_POLL_ERR;
+ }
+ }
+
+ /* Advance indexes only on success to enable getting
+ * the full CQE with ibv_poll_cq in case of failure
+ */
+ ++cur_qp->rq.tail;
+
+ ++cq->cons_index;
+
+ return CQ_OK;
+}
+
+static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp) __attribute__((always_inline));
+static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe,
+ struct mlx4_qp *cur_qp)
+{
+ uint32_t qpn;
+
+ qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (unlikely(!cur_qp))
+ return CQ_POLL_ERR;
+ cq->last_qp = cur_qp;
+ }
+
+ /* Advance indexes only on success */
+ cur_qp->sq.tail += (uint16_t)(ntohs(cqe->wqe_index) - (uint16_t)cur_qp->sq.tail);
+ ++cq->cons_index;
+
+ return CQ_OK;
+}
+
+static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) __attribute__((always_inline));
+static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size)
+{
+ int cqe_off = (cqe_size & 64) >> 1; /* CQE offset is 32 bytes in case cqe_size == 64 */
+ struct mlx4_cqe *cqe = cq->buf.buf + (cq->cons_index & cq->ibv_cq.cqe) * cqe_size + cqe_off;
+
+ if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cq->cons_index & (cq->ibv_cq.cqe + 1)))
+ return NULL;
+
+ VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ return cqe;
+}
+
+static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) __attribute__((always_inline));
+static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_cqe *cqe;
+ int npolled;
+ int err = CQ_OK;
+
+ if (unlikely(use_lock))
+ mlx4_lock(&cq->lock);
+
+ for (npolled = 0; npolled < max_entries; ++npolled) {
+ cqe = get_next_cqe(cq, cqe_size);
+ if (!cqe) {
+ err = CQ_EMPTY;
+ break;
+ }
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ if (likely(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ err = update_sq_tail(cq, cqe, cq->last_qp);
+ else
+ err = drain_rx(cq, cqe, cq->last_qp, NULL, NULL);
+
+ if (unlikely(err != CQ_OK))
+ break;
+ }
+
+ if (likely(npolled)) {
+ mlx4_update_cons_index(cq);
+ err = CQ_OK;
+ }
+
+ if (unlikely(use_lock))
+ mlx4_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? -1 : npolled;
+}
+
+static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) __attribute__((always_inline));
+static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe)
+{
+ /* Only ConnectX-3 Pro reports checksum for now */
+ if (likely(cur_qp && (cur_qp->qp_cap_cache &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP))) {
+ int32_t flags;
+ int32_t tmp;
+
+ /*
+ * The relevant bits sit at different locations in their
+ * CQE fields, so they can be combined into a single
+ * 32-bit variable.
+ */
+ tmp = (cqe->badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) |
+ (ntohs(cqe->status) & (MLX4_CQE_STATUS_IPOK |
+ MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV6)) |
+ (ntohl(cqe->vlan_my_qpn) & (MLX4_CQE_L2_TUNNEL |
+ MLX4_CQE_L2_TUNNEL_IPOK |
+ MLX4_CQE_L2_TUNNEL_L4_CSUM |
+ MLX4_CQE_L2_TUNNEL_IPV4));
+ if (likely(tmp == cur_qp->cached_rx_csum_flags)) {
+ flags = cur_qp->transposed_rx_csum_flags;
+ } else {
+ flags = mlx4_transpose(tmp, MLX4_CQE_STATUS_IPOK, IBV_EXP_CQ_RX_IP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_L4_CSUM, IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV4, IBV_EXP_CQ_RX_IPV4_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV6, IBV_EXP_CQ_RX_IPV6_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL, IBV_EXP_CQ_RX_TUNNEL_PACKET) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPOK, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_L4_CSUM, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK) |
+ mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET) |
+ mlx4_transpose(~tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV6_PACKET);
+ cur_qp->cached_rx_csum_flags = tmp;
+ cur_qp->transposed_rx_csum_flags = flags;
+ }
+
+ return flags;
+ }
+
+ return 0;
+}
+
+static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl,
+ const int use_lock, const int cqe_size,
+ uint32_t *flags) __attribute__((always_inline));
+static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl,
+ const int use_lock, const int cqe_size,
+ uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+ struct mlx4_cqe *cqe;
+ int32_t size = 0;
+ int err;
+
+ if (unlikely(use_lock))
+ mlx4_lock(&cq->lock);
+
+ cqe = get_next_cqe(cq, cqe_size);
+ if (cqe) {
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+ if (likely(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))) {
+ err = drain_rx(cq, cqe, cq->last_qp, buf, inl);
+ if (likely(err == CQ_OK)) {
+ size = ntohl(cqe->byte_cnt);
+ if (flags)
+ *flags = get_flags(cq->last_qp, cqe);
+ mlx4_update_cons_index(cq);
+ }
+ } else {
+ err = CQ_POLL_ERR;
+ }
+
+ } else {
+ err = CQ_EMPTY;
+ }
+
+
+ if (unlikely(use_lock))
+ mlx4_unlock(&cq->lock);
+
+ return err == CQ_POLL_ERR ? -1 : size;
+}
+
+int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_cnt(ibcq, max, 1, cq->cqe_size);
+}
+
+int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_cnt(ibcq, max, 0, cq->cqe_size);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 32);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 64);
+}
+
+int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max)
+{
+ return poll_cnt(ibcq, max, 0, 128);
+}
+
+int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 1, cq->cqe_size, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 0, cq->cqe_size, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 32, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 64, NULL);
+}
+
+int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl)
+{
+ return poll_length(cq, buf, inl, 0, 128, NULL);
+}
+
+int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 1, cq->cqe_size, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ struct mlx4_cq *cq = to_mcq(ibcq);
+
+ return poll_length(ibcq, buf, inl, 0, cq->cqe_size, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 32, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 64, flags);
+}
+
+int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__;
+int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags)
+{
+ return poll_length(cq, buf, inl, 0, 128, flags);
+}
+
+static struct ibv_exp_cq_family mlx4_poll_cq_family_safe = {
+ .poll_cnt = mlx4_poll_cnt_safe,
+ .poll_length = mlx4_poll_length_safe,
+ .poll_length_flags = mlx4_poll_length_flags_safe
+};
+
+enum mlx4_poll_cq_cqe_sizes {
+ MLX4_POLL_CQ_CQE_32 = 0,
+ MLX4_POLL_CQ_CQE_64 = 1,
+ MLX4_POLL_CQ_CQE_128 = 2,
+ MLX4_POLL_CQ_CQE_OTHER = 3,
+ MLX4_POLL_CQ_NUM_CQE_SIZES = 4,
+};
+
+static struct ibv_exp_cq_family mlx4_poll_cq_family_unsafe_tbl[MLX4_POLL_CQ_NUM_CQE_SIZES] = {
+ [MLX4_POLL_CQ_CQE_32] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe32,
+ .poll_length = mlx4_poll_length_unsafe_cqe32,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe32
+ },
+ [MLX4_POLL_CQ_CQE_64] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe64,
+ .poll_length = mlx4_poll_length_unsafe_cqe64,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe64
+ },
+ [MLX4_POLL_CQ_CQE_128] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_cqe128,
+ .poll_length = mlx4_poll_length_unsafe_cqe128,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe128
+ },
+ [MLX4_POLL_CQ_CQE_OTHER] = {
+ .poll_cnt = mlx4_poll_cnt_unsafe_other,
+ .poll_length = mlx4_poll_length_unsafe_other,
+ .poll_length_flags = mlx4_poll_length_flags_unsafe_other
+ },
+};
+
+struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ enum mlx4_poll_cq_cqe_sizes cqe_size = MLX4_POLL_CQ_CQE_OTHER;
+
+ if (params->flags) {
+ fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for CQ family\n", params->flags);
+ *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+ if (params->family_flags) {
+ fprintf(stderr, PFX "Family flags(0x%x) are not supported for CQ family\n", params->family_flags);
+ *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ if (cq->model_flags & MLX4_CQ_MODEL_FLAG_THREAD_SAFE)
+ return &mlx4_poll_cq_family_safe;
+
+ if (cq->cqe_size == 32)
+ cqe_size = MLX4_POLL_CQ_CQE_32;
+ else if (cq->cqe_size == 64)
+ cqe_size = MLX4_POLL_CQ_CQE_64;
+ else if (cq->cqe_size == 128)
+ cqe_size = MLX4_POLL_CQ_CQE_128;
+
+ return &mlx4_poll_cq_family_unsafe_tbl[cqe_size];
+}
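[Editor's note] The table above maps each unsafe CQE size to a dedicated poll family, while thread-safe CQs always receive the locking variants. A minimal usage sketch follows (illustration only, not part of the patch); it assumes the libibverbs experimental interface-query entry point ibv_exp_query_intf() and the conventional field names of struct ibv_exp_query_intf_params, while the struct ibv_exp_cq_family callbacks themselves are taken from this diff.

/* Sketch: drain a CQ through the exp CQ poll family returned above.
 * Assumes <infiniband/verbs_exp.h> and <string.h>; exp field names are
 * an assumption, not confirmed by this patch.
 */
static int drain_cq_with_family(struct ibv_context *ctx, struct ibv_cq *cq)
{
        struct ibv_exp_query_intf_params params;
        enum ibv_exp_query_intf_status status;
        struct ibv_exp_cq_family *fam;
        int32_t n;

        memset(&params, 0, sizeof(params));
        params.intf_scope = IBV_EXP_INTF_GLOBAL;  /* assumed field names */
        params.intf = IBV_EXP_INTF_CQ;
        params.obj = cq;

        fam = ibv_exp_query_intf(ctx, &params, &status);
        if (!fam)
                return -1;

        /* poll_cnt() returns the number of reaped completions or -1 */
        do {
                n = fam->poll_cnt(cq, 16);
        } while (n > 0);

        return n < 0 ? -1 : 0;
}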
Index: contrib/ofed/libmlx4/src/doorbell.h
===================================================================
--- contrib/ofed/libmlx4/src/doorbell.h
+++ contrib/ofed/libmlx4/src/doorbell.h
@@ -33,7 +33,8 @@
#ifndef DOORBELL_H
#define DOORBELL_H
-#ifdef __LP64__
+#if __LP64__
+
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0])
#elif __BYTE_ORDER == __BIG_ENDIAN
@@ -51,10 +52,10 @@
static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset)
{
- pthread_spin_lock(&ctx->uar_lock);
+ mlx4_spin_lock(&ctx->uar_lock);
*(volatile uint32_t *) (ctx->uar + offset) = val[0];
*(volatile uint32_t *) (ctx->uar + offset + 4) = val[1];
- pthread_spin_unlock(&ctx->uar_lock);
+ mlx4_spin_unlock(&ctx->uar_lock);
}
#endif
Index: contrib/ofed/libmlx4/src/list.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/list.h
@@ -0,0 +1,330 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1;
+ entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del(list->prev, list->next);
+ list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ __list_del(list->prev, list->next);
+ list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is
+ * empty _and_ checks that no other CPU might be
+ * in the process of still modifying either member
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ *
+ * @head: the list to test.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+ struct list_head *next = head->next;
+ return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+ struct list_head *head)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+ struct list_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ *
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#ifndef container_of
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member)*__mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; prefetch(pos->next), pos != (head); \
+ pos = pos->next)
+
+/**
+ * __list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev - iterate over a list backwards
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
+ pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ prefetch(pos->member.next), &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member); \
+ prefetch(pos->member.prev), &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use as a start point in
+ * list_for_each_entry_continue
+ * @pos: the type * to use as a start point
+ * @head: the head of the list
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_prepare_entry(pos, head, member) \
+ ((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - iterate over list of given type
+ * continuing after existing point
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+ prefetch(pos->member.next), &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop counter.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#endif
+
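[Editor's note] Since list.h imports the Linux-style intrusive list verbatim, a small self-contained usage sketch may help readers used only to container-style lists (illustration only, not part of the patch; iteration uses __list_for_each to avoid the prefetch() dependency of the other iterators).

#include <stdlib.h>
#include "list.h"

struct item {
        int val;
        struct list_head node;          /* link embedded in the payload */
};

static int list_demo(void)
{
        LIST_HEAD(items);               /* head points to itself: empty */
        struct list_head *pos, *n;
        int i, sum = 0;

        for (i = 0; i < 3; i++) {
                struct item *it = calloc(1, sizeof(*it));

                if (!it)
                        break;
                it->val = i;
                list_add_tail(&it->node, &items);   /* FIFO insertion */
        }

        __list_for_each(pos, &items)                /* no prefetch() needed */
                sum += list_entry(pos, struct item, node)->val;

        list_for_each_safe(pos, n, &items) {        /* safe against removal */
                struct item *it = list_entry(pos, struct item, node);

                list_del(&it->node);
                free(it);
        }
        return sum;                                 /* 0 + 1 + 2 == 3 */
}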
Index: contrib/ofed/libmlx4/src/mlx4-abi.h
===================================================================
--- contrib/ofed/libmlx4/src/mlx4-abi.h
+++ contrib/ofed/libmlx4/src/mlx4-abi.h
@@ -35,14 +35,22 @@
#include <infiniband/kern-abi.h>
-#define MLX4_UVERBS_MIN_ABI_VERSION 2
+#define MLX4_UVERBS_MIN_ABI_VERSION 3
#define MLX4_UVERBS_MAX_ABI_VERSION 4
+enum {
+ MLX4_USER_DEV_CAP_64B_CQE = 1L << 0,
+#ifdef MLX4_WQE_FORMAT
+ MLX4_USER_DEV_CAP_WQE_FORMAT = 1L << 1
+#endif
+};
+
struct mlx4_alloc_ucontext_resp_v3 {
struct ibv_get_context_resp ibv_resp;
__u32 qp_tab_size;
__u16 bf_reg_size;
__u16 bf_regs_per_page;
+ __u32 cqe_size;
};
struct mlx4_alloc_ucontext_resp {
@@ -54,6 +62,14 @@
__u32 cqe_size;
};
+struct mlx4_alloc_ucontext_req {
+ struct ibv_get_context cmd;
+#ifdef MLX4_WQE_FORMAT
+ __u32 lib_caps;
+ __u32 reserved;
+#endif
+};
+
struct mlx4_alloc_pd_resp {
struct ibv_alloc_pd_resp ibv_resp;
__u32 pdn;
@@ -77,16 +93,14 @@
__u64 buf_addr;
};
-#ifdef HAVE_IBV_XRC_OPS
-struct mlx4_create_xrc_srq {
- struct ibv_create_xrc_srq ibv_cmd;
+struct mlx4_create_srq {
+ struct ibv_create_srq ibv_cmd;
__u64 buf_addr;
__u64 db_addr;
};
-#endif
-struct mlx4_create_srq {
- struct ibv_create_srq ibv_cmd;
+struct mlx4_create_xsrq {
+ struct ibv_create_xsrq ibv_cmd;
__u64 buf_addr;
__u64 db_addr;
};
@@ -97,8 +111,7 @@
__u32 reserved;
};
-struct mlx4_create_qp {
- struct ibv_create_qp ibv_cmd;
+struct mlx4_create_qp_base {
__u64 buf_addr;
__u64 db_addr;
__u8 log_sq_bb_count;
@@ -107,12 +120,14 @@
__u8 reserved[5];
};
-#ifdef HAVE_IBV_XRC_OPS
-struct mlx4_open_xrc_domain_resp {
- struct ibv_open_xrc_domain_resp ibv_resp;
- __u32 xrcdn;
- __u32 reserved;
+struct mlx4_exp_create_qp_provider {
+ struct mlx4_create_qp_base base;
+ __u64 uar_virt_add;
+};
+
+struct mlx4_create_qp {
+ struct ibv_create_qp ibv_cmd;
+ struct mlx4_create_qp_base base;
};
-#endif
#endif /* MLX4_ABI_H */
Index: contrib/ofed/libmlx4/src/mlx4.h
===================================================================
--- contrib/ofed/libmlx4/src/mlx4.h
+++ contrib/ofed/libmlx4/src/mlx4.h
@@ -34,10 +34,32 @@
#ifndef MLX4_H
#define MLX4_H
+#include <stdio.h>
#include <stddef.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
#include <infiniband/driver.h>
+#include <infiniband/driver_exp.h>
#include <infiniband/arch.h>
+#include <infiniband/verbs.h>
+#include <infiniband/verbs_exp.h>
+
+#define MLX4_MMAP_CMD_BITS 8
+#define MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD 2
+#define MLX4_IB_MMAP_GET_HW_CLOCK 3
+
+/* Use EXP mmap commands until they are accepted upstream */
+#define MLX4_IB_EXP_MMAP_EXT_UAR_PAGE 0xFE
+#define MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE 0xFF
+
+#define MLX4_IB_MMAP_CMD_MASK 0xFF
+#define MLX4_CQ_PREFIX "MLX_CQ"
+#define MLX4_QP_PREFIX "MLX_QP"
+#define MLX4_MR_PREFIX "MLX_MR"
+#define MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE 23
+#define MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE 12
+#define MLX4_PORTS_NUM 2
#ifdef HAVE_VALGRIND_MEMCHECK_H
@@ -69,7 +91,7 @@
#if defined(__i386__)
#define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
-#elif defined(__x86_64__)
+#elif defined(__amd64__)
#define wc_wmb() asm volatile("sfence" ::: "memory")
#elif defined(__ia64__)
#define wc_wmb() asm volatile("fwb" ::: "memory")
@@ -79,29 +101,93 @@
#endif
-#ifndef HAVE_IBV_MORE_OPS
-#undef HAVE_IBV_XRC_OPS
-#undef HAVE_IBV_CREATE_QP_EXP
-#endif
-
#define HIDDEN __attribute__((visibility ("hidden")))
+#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if MLX4_GCC_VERSION >= 403
+# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64)))
+# define __MLX4_ALGN_DATA__ __attribute__((aligned(64)))
+#else
+# define __MLX4_ALGN_FUNC__
+# define __MLX4_ALGN_DATA__
+#endif
+
#define PFX "mlx4: "
#ifndef max
-#define max(a,b) \
+#define max(a, b) \
({ typeof (a) _a = (a); \
typeof (b) _b = (b); \
_a > _b ? _a : _b; })
#endif
#ifndef min
-#define min(a,b) \
+#define min(a, b) \
({ typeof (a) _a = (a); \
typeof (b) _b = (b); \
_a < _b ? _a : _b; })
#endif
+#ifndef likely
+#ifdef __GNUC__
+#define likely(x) __builtin_expect(!!(x),1)
+#else
+#define likely(x) (x)
+#endif
+#endif
+
+
+#ifndef unlikely
+#ifdef __GNUC__
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define unlikely(x) (x)
+#endif
+#endif
+
+#ifndef uninitialized_var
+#define uninitialized_var(x) x = x
+#endif
+
+#include "list.h"
+
+/****************************************/
+/* ioctl codes */
+/****************************************/
+#define MLX4_IOC_MAGIC 'm'
+#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int)
+
+/* Generic macro to convert MLX4 to IBV flags. */
+#define MLX4_TRANSPOSE(val, from, to) \
+ (((from) >= (to)) ? \
+ (((val) & (from)) / ((from) / (to))) : \
+ (((val) & (from)) * ((to) / (from))))
+
+static inline uint64_t mlx4_transpose_uint16_t(uint16_t val, uint16_t from, uint64_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
+static inline uint64_t mlx4_transpose_uint32_t(uint32_t val, uint32_t from, uint64_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
+static inline uint32_t mlx4_transpose(uint32_t val, uint32_t from, uint32_t to)
+{
+ return MLX4_TRANSPOSE(val, from, to);
+}
+
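[Editor's note] MLX4_TRANSPOSE() relocates a flag from its bit position in the "from" mask to its position in the "to" mask by scaling with the ratio of the two masks; the typed wrappers above merely fix the promotion width. A tiny worked example with hypothetical constants (illustration only, not part of the patch):

/* Hypothetical constants, for illustration only. */
enum {
        EXAMPLE_HW_FLAG   = 1 << 12,    /* bit as reported by the HW     */
        EXAMPLE_VERB_FLAG = 1 << 2,     /* bit expected by the verbs API */
};

static uint32_t transpose_example(void)
{
        /*
         * (0x1234 & 0x1000) == 0x1000; since from > to the macro divides
         * by (0x1000 / 0x4) == 0x400, yielding 0x4 == EXAMPLE_VERB_FLAG.
         */
        return mlx4_transpose(0x1234, EXAMPLE_HW_FLAG, EXAMPLE_VERB_FLAG);
}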
+enum {
+ MLX4_MAX_FAMILY_VER = 0
+};
+
+enum {
+ MLX4_MAX_BFS_IN_PAGE = 8,
+ MLX4_BFS_STRIDE = 512,
+};
+
enum {
MLX4_STAT_RATE_OFFSET = 5
};
@@ -112,14 +198,86 @@
MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
};
+#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl(wr->qp_type.xrc.remote_srqn << 8)
+
enum {
- MLX4_XRC_SRQ_TABLE_BITS = 8,
- MLX4_XRC_SRQ_TABLE_SIZE = 1 << MLX4_XRC_SRQ_TABLE_BITS,
- MLX4_XRC_SRQ_TABLE_MASK = MLX4_XRC_SRQ_TABLE_SIZE - 1
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
};
enum {
- MLX4_XRC_QPN_BIT = (1 << 23)
+ MLX4_QP_PATTERN = 0x012389AB,
+ MLX4_CQ_PATTERN = 0x4567CDEF
+};
+
+enum mlx4_lock_type {
+ MLX4_SPIN_LOCK = 0,
+ MLX4_MUTEX = 1,
+};
+
+enum mlx4_lock_state {
+ MLX4_USE_LOCK,
+ MLX4_LOCKED,
+ MLX4_UNLOCKED
+};
+
+/* QP DoorBell ringing methods */
+enum mlx4_db_method {
+ MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB,/* QP has dedicated BF, */
+ /* only one thread is using this QP, */
+ /* the arch supports WC auto evict and */
+ /* prefer_bf flag is set. */
+ /* This means that there is no need for */
+ /* wc_wmb to flush the WC buffer */
+ MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB, /* Same as previous but prefer_bf */
+ /* flag is not set */
+ MLX4_QP_DB_METHOD_DEDIC_BF, /* QP has dedicated BF */
+ MLX4_QP_DB_METHOD_BF, /* QP has BF which may be shared with other QPs */
+ MLX4_QP_DB_METHOD_DB /* BF is not valid for this QP, use DoorBell to send the messages */
+};
+
+enum mlx4_res_domain_bf_type {
+ MLX4_RES_DOMAIN_BF_NONE, /* No BF for this resource domain */
+ MLX4_RES_DOMAIN_BF_SAFE, /* Use BF when possible */
+ MLX4_RES_DOMAIN_BF_UNSAFE, /* Use BF when possible. */
+ /* The application is responsible to sync between */
+ /* calls to objects using this resource domain. */
+ /* This means that there is no need to use the BF */
+ /* lock. */
+ MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT, /* Use BF when possible. */
+ /* Only one thread is using this resource */
+ /* and the arch supports WC auto-evict. */
+ /* This means that there is no need to use */
+ /* wc_wmb function to flush the BF buffer */
+
+};
+
+struct mlx4_xsrq_table {
+ struct {
+ struct mlx4_srq **table;
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+
+ pthread_mutex_t mutex;
+ int num_xsrq;
+ int shift;
+ int mask;
+};
+
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+
+enum {
+ MLX4_XRC_QPN_BIT = (1 << 23)
+};
+
+enum qp_cap_cache {
+ /* The flag below includes VXLAN support as well in mlx4 HW*/
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1
};
enum mlx4_db_type {
@@ -128,6 +286,15 @@
MLX4_NUM_DB_TYPE
};
+enum mlx4_alloc_type {
+ MLX4_ALLOC_TYPE_ANON,
+ MLX4_ALLOC_TYPE_HUGE,
+ MLX4_ALLOC_TYPE_CONTIG,
+ MLX4_ALLOC_TYPE_PREFER_HUGE,
+ MLX4_ALLOC_TYPE_PREFER_CONTIG,
+ MLX4_ALLOC_TYPE_ALL
+};
+
enum {
MLX4_OPCODE_NOP = 0x00,
MLX4_OPCODE_SEND_INVAL = 0x01,
@@ -146,6 +313,12 @@
MLX4_OPCODE_LOCAL_INVAL = 0x1b,
MLX4_OPCODE_CONFIG_CMD = 0x1f,
+ MLX4_OPCODE_SEND_ENABLE = 0x17,
+ MLX4_OPCODE_RECV_ENABLE = 0x16,
+ MLX4_OPCODE_CQE_WAIT = 0x0f,
+ MLX4_OPCODE_CALC_SEND = 0x1e,
+ MLX4_OPCODE_CALC_RDMA_WRITE_IMM = 0x1f,
+
MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00,
MLX4_RECV_OPCODE_SEND = 0x01,
MLX4_RECV_OPCODE_SEND_IMM = 0x02,
@@ -155,28 +328,86 @@
MLX4_CQE_OPCODE_RESIZE = 0x16,
};
+extern int mlx4_stall_num_loop;
+extern int mlx4_trace;
+extern int mlx4_single_threaded;
+extern int mlx4_use_mutex;
+
enum {
MLX4_MAX_WQE_SIZE = 1008
};
struct mlx4_device {
- struct ibv_device ibv_dev;
+ struct verbs_device verbs_dev;
int page_size;
- int driver_abi_ver;
+
+ struct {
+ unsigned id;
+ unsigned short rev;
+ } devid;
+ int driver_abi_ver;
};
struct mlx4_db_page;
+struct mlx4_lock {
+ pthread_mutex_t mutex;
+ pthread_spinlock_t slock;
+ enum mlx4_lock_state state;
+ enum mlx4_lock_type type;
+};
+
+struct mlx4_spinlock {
+ pthread_spinlock_t lock;
+ enum mlx4_lock_state state;
+};
+
+/* struct for BF dedicated for one QP */
+struct mlx4_dedic_bf {
+ void *address;
+};
+
+/* struct for the common BF which may be shared by many QPs */
+struct mlx4_cmn_bf {
+ void *address;
+ /*
+ * Protect usage of BF address field including data written to the BF
+ * and the BF buffer toggling.
+ */
+ struct mlx4_lock lock;
+};
+
+union mlx4_bf {
+ struct mlx4_dedic_bf dedic;
+ struct mlx4_cmn_bf cmn;
+};
+
+struct mlx4_bfs_data {
+ struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1];
+ struct mlx4_cmn_bf cmn_bf;
+ uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1];
+ uint8_t dedic_bf_free;
+ struct mlx4_spinlock dedic_bf_lock; /* protect dedicated BFs managing */
+ /* including dedic_bf_used and */
+ /* dedic_bf_free fields */
+ void *page;
+ uint16_t buf_size;
+ uint8_t num_dedic_bfs;
+};
+
struct mlx4_context {
- struct ibv_context ibv_ctx;
+ union {
+ struct ibv_context ibv_ctx;
+ };
+ struct mlx4_spinlock send_db_lock; /* protects send_db_list and send_db_num_uars */
+ struct list_head send_db_list;
+ unsigned int send_db_num_uars;
void *uar;
- pthread_spinlock_t uar_lock;
-
- void *bf_page;
- int bf_buf_size;
- int bf_offset;
- pthread_spinlock_t bf_lock;
+ struct mlx4_spinlock uar_lock;
+ struct mlx4_bfs_data bfs;
+ int bf_regs_per_page;
+ int max_ctx_res_domain;
struct {
struct mlx4_qp **table;
@@ -189,24 +420,39 @@
int max_qp_wr;
int max_sge;
int max_cqe;
- int cqe_size;
-
+ uint64_t exp_device_cap_flags;
struct {
- struct mlx4_srq **table;
- int refcnt;
- } xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE];
- pthread_mutex_t xrc_srq_table_mutex;
- int num_xrc_srqs;
- int xrc_srq_table_shift;
- int xrc_srq_table_mask;
+ int offset;
+ int mult;
+ int shift;
+ uint64_t mask;
+ } core_clk;
+ void *hca_core_clock;
+
+ struct mlx4_xsrq_table xsrq_table;
struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
pthread_mutex_t db_list_mutex;
+ int cqe_size;
+ int prefer_bf;
+ struct mlx4_spinlock hugetlb_lock;
+ struct list_head hugetlb_list;
+ int stall_enable;
+ pthread_mutex_t task_mutex;
+ struct {
+ uint8_t valid;
+ uint8_t link_layer;
+ enum ibv_port_cap_flags caps;
+ } port_query_cache[MLX4_PORTS_NUM];
+ pthread_mutex_t env_mtx;
+ int env_initialized;
};
struct mlx4_buf {
void *buf;
+ void *hmem;
size_t length;
+ int base;
};
struct mlx4_pd {
@@ -214,23 +460,40 @@
uint32_t pdn;
};
+enum mlx4_cq_model_flags {
+ /*
+ * When set the CQ API must be thread safe.
+ * When reset application is taking care
+ * to sync between CQ API calls.
+ */
+ MLX4_CQ_MODEL_FLAG_THREAD_SAFE = 1 << 0,
+};
+
struct mlx4_cq {
- struct ibv_cq ibv_cq;
+ struct ibv_cq ibv_cq __MLX4_ALGN_DATA__;
+ uint32_t pattern;
struct mlx4_buf buf;
struct mlx4_buf resize_buf;
- pthread_spinlock_t lock;
+ struct mlx4_lock lock;
uint32_t cqn;
uint32_t cons_index;
+ uint32_t wait_index;
+ uint32_t wait_count;
uint32_t *set_ci_db;
uint32_t *arm_db;
int arm_sn;
- int cqe_size;
+ int stall_next_poll;
+ int stall_enable;
+ int cqe_size;
+ int creation_flags;
+ struct mlx4_qp *last_qp;
+ uint32_t model_flags; /* use mlx4_cq_model_flags */
};
struct mlx4_srq {
- struct ibv_srq ibv_srq;
+ struct verbs_srq verbs_srq;
struct mlx4_buf buf;
- pthread_spinlock_t lock;
+ struct mlx4_spinlock lock;
uint64_t *wrid;
uint32_t srqn;
int max;
@@ -240,33 +503,102 @@
int tail;
uint32_t *db;
uint16_t counter;
+ uint8_t ext_srq;
+ struct ibv_srq_legacy *ibv_srq_legacy;
};
struct mlx4_wq {
uint64_t *wrid;
- pthread_spinlock_t lock;
+ struct mlx4_lock lock;
int wqe_cnt;
int max_post;
+ char *buf;
unsigned head;
unsigned tail;
int max_gs;
int wqe_shift;
- int offset;
+
+ /* SEND/RECV_ENABLE data */
+ unsigned head_en_index;
+ unsigned head_en_count;
+};
+
+/* enclosing ibv_mr adding some extra managing information */
+struct mlx4_mr {
+ struct ibv_mr ibv_mr;
+ struct mlx4_buf buf;
+ uint64_t allocation_flags;
+ int shared_mr;
+};
+
+
+struct mlx4_inlr_rbuff {
+ void *rbuff;
+ int rlen;
+};
+
+struct mlx4_inlr_sg_list {
+ struct mlx4_inlr_rbuff *sg_list;
+ int list_len;
+};
+
+struct mlx4_inlr_buff {
+ struct mlx4_inlr_sg_list *buff;
+ int len;
+};
+
+struct mlx4_send_db_data {
+ union mlx4_bf bf;
+ uint32_t *db_addr; /* Points to the BF related send DB */
+ struct list_head list;
+};
+
+enum mlx4_qp_model_flags {
+ /*
+ * When set the QP API must be thread safe.
+ * When reset application is taking care
+ * to sync between QP API calls.
+ */
+ MLX4_QP_MODEL_FLAG_THREAD_SAFE = 1 << 0,
};
struct mlx4_qp {
- struct ibv_qp ibv_qp;
- struct mlx4_buf buf;
- int max_inline_data;
+ struct verbs_qp verbs_qp;
+ uint32_t pattern;
int buf_size;
-
+ uint32_t model_flags; /* use mlx4_qp_model_flags */
+
+ /* hot post send data */
+ struct mlx4_wq sq __MLX4_ALGN_DATA__;
+ int (*post_send_one)(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe, int *total_size,
+ int *inl, unsigned int ind);
+ union mlx4_bf *bf;
+ uint32_t *sdb; /* send DB */
+ struct mlx4_buf buf;
+ unsigned last_db_head;
uint32_t doorbell_qpn;
- uint32_t sq_signal_bits;
- int sq_spare_wqes;
- struct mlx4_wq sq;
-
+ uint32_t create_flags;
+ uint16_t max_inline_data;
+ uint16_t bf_buf_size;
+ uint16_t sq_spare_wqes;
+ uint8_t srcrb_flags_tbl[16];
+ uint8_t db_method;
+ uint8_t qp_type;
+ /* RAW_PACKET hot data */
+ uint8_t link_layer;
+ /* EXT_MASKED_ATOMIC hot data */
+ uint8_t is_masked_atomic;
+
+ /* post receive hot data */
+ struct mlx4_wq rq __MLX4_ALGN_DATA__;
uint32_t *db;
- struct mlx4_wq rq;
+ uint32_t max_inlr_sg;
+ int32_t cached_rx_csum_flags;
+ int32_t transposed_rx_csum_flags;
+ struct mlx4_inlr_buff inlr_buff;
+ uint8_t qp_cap_cache;
};
struct mlx4_av {
@@ -280,7 +612,6 @@
uint8_t hop_limit;
uint32_t sl_tclass_flowlabel;
uint8_t dgid[16];
- uint8_t mac[8];
};
struct mlx4_ah {
@@ -288,18 +619,20 @@
struct mlx4_av av;
uint16_t vlan;
uint8_t mac[6];
- uint8_t tagged;
};
-struct mlx4_xrc_domain {
- struct ibv_xrc_domain ibv_xrcd;
- uint32_t xrcdn;
+struct mlx4_res_domain {
+ struct ibv_exp_res_domain ibv_res_domain;
+ struct ibv_exp_res_domain_init_attr attr;
+ enum mlx4_res_domain_bf_type type;
+ struct mlx4_send_db_data *send_db;
};
static inline unsigned long align(unsigned long val, unsigned long align)
{
return (val + align - 1) & ~(align - 1);
}
+int align_queue_size(int req);
#define to_mxxx(xxx, type) \
((struct mlx4_##type *) \
@@ -307,7 +640,10 @@
static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev)
{
- return to_mxxx(dev, device);
+ /* ibv_device is first field of verbs_device
+ * see try_driver in libibverbs.
+ */
+ return container_of(ibdev, struct mlx4_device, verbs_dev);
}
static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
@@ -327,32 +663,53 @@
static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq)
{
- return to_mxxx(srq, srq);
+ return container_of(container_of(ibsrq, struct verbs_srq, srq),
+ struct mlx4_srq, verbs_srq);
}
static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
{
- return to_mxxx(qp, qp);
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
}
+static inline struct mlx4_mr *to_mmr(struct ibv_mr *ibmr)
+{
+ return to_mxxx(mr, mr);
+}
static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
{
return to_mxxx(ah, ah);
}
-#ifdef HAVE_IBV_XRC_OPS
-static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd)
+static inline struct mlx4_res_domain *to_mres_domain(struct ibv_exp_res_domain *ibres_domain)
{
- return to_mxxx(xrcd, xrc_domain);
+ return to_mxxx(res_domain, res_domain);
}
-#endif
+int update_port_data(struct ibv_qp *qp, uint8_t port_num);
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
void mlx4_free_buf(struct mlx4_buf *buf);
+int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size);
+int mlx4_alloc_buf_contig(struct mlx4_context *mctx, struct mlx4_buf *buf,
+ size_t size, int page_size, const char *component, void *req_addr);
+int mlx4_alloc_prefered_buf(struct mlx4_context *mctx,
+ struct mlx4_buf *buf,
+ size_t size, int page_size,
+ enum mlx4_alloc_type alloc_type,
+ const char *component);
+void mlx4_get_alloc_type(struct ibv_context *context, const char *component,
+ enum mlx4_alloc_type *alloc_type,
+ enum mlx4_alloc_type default_alloc_type);
+void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf);
+int mlx4_use_huge(struct ibv_context *context, const char *key);
uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db);
+int __mlx4_query_device(uint64_t raw_fw_ver,
+ struct ibv_device_attr *attr);
int mlx4_query_device(struct ibv_context *context,
struct ibv_device_attr *attr);
int mlx4_query_port(struct ibv_context *context, uint8_t port,
@@ -360,19 +717,42 @@
struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context);
int mlx4_free_pd(struct ibv_pd *pd);
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr);
+int mlx4_close_xrcd(struct ibv_xrcd *xrcd);
struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
- size_t length, enum ibv_access_flags access);
+ size_t length, int access);
+struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in);
+int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr,
+ struct ibv_exp_send_wr **bad_wr);
+void mlx4_update_post_send_one(struct mlx4_qp *qp);
+struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+
+struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in);
int mlx4_dereg_mr(struct ibv_mr *mr);
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type);
+int mlx4_dealloc_mw(struct ibv_mw *mw);
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind);
+
struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector);
-int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent,
+int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent,
int entry_size);
int mlx4_resize_cq(struct ibv_cq *cq, int cqe);
int mlx4_destroy_cq(struct ibv_cq *cq);
-int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_poll_ibv_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc);
+int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries,
+ struct ibv_exp_wc *wc, uint32_t wc_size) __MLX4_ALGN_FUNC__;
int mlx4_arm_cq(struct ibv_cq *cq, int solicited);
void mlx4_cq_event(struct ibv_cq *cq);
void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq);
@@ -382,76 +762,207 @@
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
struct ibv_srq_init_attr *attr);
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
- enum ibv_srq_attr_mask mask);
+ int mask);
int mlx4_query_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr);
int mlx4_destroy_srq(struct ibv_srq *srq);
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq);
int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
struct mlx4_srq *srq);
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr);
-struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
-int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
- struct mlx4_srq *srq);
-void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn);
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr);
+int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, int attr_mask);
+int mlx4_post_task(struct ibv_context *context,
+ struct ibv_exp_task *task_list,
+ struct ibv_exp_task **bad_task);
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr);
int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask,
+ int attr_mask,
struct ibv_qp_init_attr *init_attr);
int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask);
+ int attr_mask);
+int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr,
+ uint64_t attr_mask);
int mlx4_destroy_qp(struct ibv_qp *qp);
+void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n);
void mlx4_init_qp_indices(struct mlx4_qp *qp);
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp);
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
- struct ibv_send_wr **bad_wr);
+ struct ibv_send_wr **bad_wr) __MLX4_ALGN_FUNC__;
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
- struct ibv_recv_wr **bad_wr);
+ struct ibv_recv_wr **bad_wr) __MLX4_ALGN_FUNC__;
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
struct mlx4_qp *qp);
int num_inline_segs(int data, enum ibv_qp_type type);
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
- enum ibv_qp_type type, struct mlx4_qp *qp);
+void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp);
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
enum ibv_qp_type type);
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn);
int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp);
void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn);
+struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd,
+ struct ibv_ah_attr *attr,
+ uint8_t link_layer);
struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd,
+ struct ibv_exp_ah_attr *attr_ex);
int mlx4_destroy_ah(struct ibv_ah *ah);
int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr,
struct mlx4_ah *ah);
void mlx4_free_av(struct mlx4_ah *ah);
-#ifdef HAVE_IBV_XRC_OPS
-struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
- struct ibv_xrc_domain *xrc_domain,
- struct ibv_cq *xrc_cq,
- struct ibv_srq_init_attr *attr);
-struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
- int fd, int oflag);
-
-int mlx4_close_xrc_domain(struct ibv_xrc_domain *d);
-int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr,
- uint32_t *xrc_qp_num);
-int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask);
-int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask,
- struct ibv_qp_init_attr *init_attr);
-int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num);
-int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num);
-#endif
+struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr);
+int mlx4_query_values(struct ibv_context *context, int q_values,
+ struct ibv_exp_values *values);
+void *mlx4_get_legacy_xrc(struct ibv_srq *srq);
+void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq);
+void read_init_vars(struct mlx4_context *ctx);
+
+static inline enum mlx4_lock_type mlx4_get_locktype(void)
+{
+ if (!mlx4_use_mutex)
+ return MLX4_SPIN_LOCK;
+
+ return MLX4_MUTEX;
+}
+
+static inline int mlx4_spin_lock(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_lock(&lock->lock);
+
+ if (unlikely(lock->state == MLX4_LOCKED)) {
+ fprintf(stderr, "*** ERROR: multithreading violation ***\n"
+ "You are running a multithreaded application but\n"
+ "you set MLX4_SINGLE_THREADED=1. Please unset it.\n");
+ abort();
+ } else {
+ lock->state = MLX4_LOCKED;
+ wmb();
+ }
+
+ return 0;
+}
+
+static inline int mlx4_spin_unlock(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_unlock(&lock->lock);
+
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_lock(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_lock(&lock->slock);
+
+ return pthread_mutex_lock(&lock->mutex);
+ }
+
+ if (unlikely(lock->state == MLX4_LOCKED)) {
+ fprintf(stderr, "*** ERROR: multithreading violation ***\n"
+ "You are running a multithreaded application but\n"
+ "you set MLX4_SINGLE_THREADED=1. Please unset it.\n");
+ abort();
+ } else {
+ lock->state = MLX4_LOCKED;
+ /* Make new state visible to other threads. */
+ wmb();
+ }
+
+ return 0;
+}
+
+static inline int mlx4_unlock(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_unlock(&lock->slock);
+
+ return pthread_mutex_unlock(&lock->mutex);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+static inline int mlx4_spinlock_init(struct mlx4_spinlock *lock, int use_spinlock)
+{
+ if (use_spinlock) {
+ lock->state = MLX4_USE_LOCK;
+ return pthread_spin_init(&lock->lock, PTHREAD_PROCESS_PRIVATE);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_spinlock_destroy(struct mlx4_spinlock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK)
+ return pthread_spin_destroy(&lock->lock);
+
+ return 0;
+}
+
+static inline int mlx4_lock_init(struct mlx4_lock *lock,
+ int use_lock,
+ enum mlx4_lock_type lock_type)
+{
+ if (use_lock) {
+ lock->type = lock_type;
+ lock->state = MLX4_USE_LOCK;
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_init(&lock->slock,
+ PTHREAD_PROCESS_PRIVATE);
+
+ return pthread_mutex_init(&lock->mutex,
+ PTHREAD_PROCESS_PRIVATE);
+ }
+ lock->state = MLX4_UNLOCKED;
+
+ return 0;
+}
+
+static inline int mlx4_lock_destroy(struct mlx4_lock *lock)
+{
+ if (lock->state == MLX4_USE_LOCK) {
+ if (lock->type == MLX4_SPIN_LOCK)
+ return pthread_spin_destroy(&lock->slock);
+
+ return pthread_mutex_destroy(&lock->mutex);
+ }
+
+ return 0;
+}
+
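[Editor's note] The helpers above implement the library's two locking models: real pthread locks when use_lock is set, and plain LOCKED/UNLOCKED state flips (which only detect misuse of MLX4_SINGLE_THREADED=1) otherwise. A short usage sketch built only from the functions declared above (illustration only, not part of the patch):

static int lock_usage_demo(void)
{
        struct mlx4_lock lk;

        /* thread-safe unless the user promised a single thread */
        if (mlx4_lock_init(&lk, !mlx4_single_threaded, mlx4_get_locktype()))
                return -1;                      /* pthread init failed */

        mlx4_lock(&lk);
        /* ... critical section: touch shared CQ/QP state ... */
        mlx4_unlock(&lk);

        return mlx4_lock_destroy(&lk);
}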
+static inline void mlx4_update_cons_index(struct mlx4_cq *cq)
+{
+ *cq->set_ci_db = htonl(cq->cons_index & 0xffffff);
+}
#endif /* MLX4_H */
Index: contrib/ofed/libmlx4/src/mlx4.c
===================================================================
--- contrib/ofed/libmlx4/src/mlx4.c
+++ contrib/ofed/libmlx4/src/mlx4.c
@@ -41,18 +41,27 @@
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
-
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sched.h>
#ifndef HAVE_IBV_REGISTER_DRIVER
#include <sysfs/libsysfs.h>
#endif
+#include <sys/cpuset.h>
#include "mlx4.h"
#include "mlx4-abi.h"
+#include "mlx4_exp.h"
+
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX 0x15b3
#endif
+int mlx4_trace = 0;
+int mlx4_single_threaded = 0;
+int mlx4_use_mutex = 0;
+
#define HCA(v, d) \
{ .vendor = PCI_VENDOR_ID_##v, \
.device = d }
@@ -66,47 +75,30 @@
HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */
HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */
HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */
- HCA(MELLANOX, 0x6368), /* MT25448 [ConnectX EN 10GigE, PCIe 2.0 2.5GT/s] */
- HCA(MELLANOX, 0x6750), /* MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */
- HCA(MELLANOX, 0x6372), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 2.0 2.5GT/s] */
- HCA(MELLANOX, 0x675a), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe Gen2 5GT/s] */
- HCA(MELLANOX, 0x6764), /* MT26468 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */
- HCA(MELLANOX, 0x6746), /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
- HCA(MELLANOX, 0x676e), /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
- HCA(MELLANOX, 0x6778), /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
- HCA(MELLANOX, 0x1000),
- HCA(MELLANOX, 0x1001),
- HCA(MELLANOX, 0x1002),
- HCA(MELLANOX, 0x1003),
- HCA(MELLANOX, 0x1004),
- HCA(MELLANOX, 0x1005),
- HCA(MELLANOX, 0x1006),
- HCA(MELLANOX, 0x1007),
- HCA(MELLANOX, 0x1008),
- HCA(MELLANOX, 0x1009),
- HCA(MELLANOX, 0x100a),
- HCA(MELLANOX, 0x100b),
- HCA(MELLANOX, 0x100c),
- HCA(MELLANOX, 0x100d),
- HCA(MELLANOX, 0x100e),
- HCA(MELLANOX, 0x100f),
+ HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */
+ HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/
+ HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+ HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */
+ HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */
+ HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */
+ HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */
+ HCA(MELLANOX, 0x1005), /* MT27510 Family */
+ HCA(MELLANOX, 0x1006), /* MT27511 Family */
+ HCA(MELLANOX, 0x1007), /* MT27520 Family */
+ HCA(MELLANOX, 0x1008), /* MT27521 Family */
+ HCA(MELLANOX, 0x1009), /* MT27530 Family */
+ HCA(MELLANOX, 0x100a), /* MT27531 Family */
+ HCA(MELLANOX, 0x100b), /* MT27540 Family */
+ HCA(MELLANOX, 0x100c), /* MT27541 Family */
+ HCA(MELLANOX, 0x100d), /* MT27550 Family */
+ HCA(MELLANOX, 0x100e), /* MT27551 Family */
+ HCA(MELLANOX, 0x100f), /* MT27560 Family */
+ HCA(MELLANOX, 0x1010), /* MT27561 Family */
};
-#ifdef HAVE_IBV_MORE_OPS
-static struct ibv_more_ops mlx4_more_ops = {
-#ifdef HAVE_IBV_XRC_OPS
- .create_xrc_srq = mlx4_create_xrc_srq,
- .open_xrc_domain = mlx4_open_xrc_domain,
- .close_xrc_domain = mlx4_close_xrc_domain,
- .create_xrc_rcv_qp = mlx4_create_xrc_rcv_qp,
- .modify_xrc_rcv_qp = mlx4_modify_xrc_rcv_qp,
- .query_xrc_rcv_qp = mlx4_query_xrc_rcv_qp,
- .reg_xrc_rcv_qp = mlx4_reg_xrc_rcv_qp,
- .unreg_xrc_rcv_qp = mlx4_unreg_xrc_rcv_qp,
-#endif
-};
-#endif
-
static struct ibv_context_ops mlx4_ctx_ops = {
.query_device = mlx4_query_device,
.query_port = mlx4_query_port,
@@ -114,8 +106,11 @@
.dealloc_pd = mlx4_free_pd,
.reg_mr = mlx4_reg_mr,
.dereg_mr = mlx4_dereg_mr,
+ .alloc_mw = mlx4_alloc_mw,
+ .dealloc_mw = mlx4_dealloc_mw,
+ .bind_mw = mlx4_bind_mw,
.create_cq = mlx4_create_cq,
- .poll_cq = mlx4_poll_cq,
+ .poll_cq = mlx4_poll_ibv_cq,
.req_notify_cq = mlx4_arm_cq,
.cq_event = mlx4_cq_event,
.resize_cq = mlx4_resize_cq,
@@ -137,150 +132,592 @@
.detach_mcast = ibv_cmd_detach_mcast
};
-static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_fd)
+static int read_number_from_line(const char *line, int *value)
{
- struct mlx4_context *context;
- struct ibv_get_context cmd;
- struct mlx4_alloc_ucontext_resp resp;
- struct mlx4_alloc_ucontext_resp_v3 resp_v3;
- int i;
- struct ibv_device_attr dev_attrs;
- unsigned int bf_reg_size;
+ const char *ptr;
- context = calloc(1, sizeof *context);
- if (!context)
- return NULL;
+ ptr = strchr(line, ':');
+ if (!ptr)
+ return 1;
+
+ ++ptr;
+
+ *value = atoi(ptr);
+ return 0;
+}
+
+static int mlx4_is_sandy_bridge(int *num_cores)
+{
+ char line[128];
+ FILE *fd;
+ int rc = 0;
+ int cur_cpu_family = -1;
+ int cur_cpu_model = -1;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (!fd)
+ return 0;
+
+ *num_cores = 0;
+
+ while (fgets(line, 128, fd)) {
+ int value;
+
+ /* if this is information on new processor */
+ if (!strncmp(line, "processor", 9)) {
+ ++*num_cores;
+
+ cur_cpu_family = -1;
+ cur_cpu_model = -1;
+ } else if (!strncmp(line, "cpu family", 10)) {
+ if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
+ cur_cpu_family = value;
+ } else if (!strncmp(line, "model", 5)) {
+ if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
+ cur_cpu_model = value;
+ }
+
+ /* if this is a Sandy Bridge CPU */
+ if ((cur_cpu_family == 6) &&
+ (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
+ rc = 1;
+ }
+
+ fclose(fd);
+ return rc;
+}
+
+static void mlx4_check_numa_enabled(struct ibv_context *context)
+{
+ char fname[MAXPATHLEN];
+ char buf[128];
+ FILE *fp;
+ int numa_enabled;
+ char env[VERBS_MAX_ENV_VAL];
+
+ snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/numa_node",
+ ibv_get_device_name(context->device));
+
+ fp = fopen(fname, "r");
+ if (!fp) {
+ fprintf(stderr, PFX "Warning: can not check if NUMA is enabled "
+ "on node: failed to open %s\n", fname);
+ return;
+ }
+
+ if (!fgets(buf, sizeof(buf), fp)) {
+ fprintf(stderr, PFX "Warning: can not check if NUMA is enabled "
+ "on node: failed to read numa node value\n");
+ goto out;
+ }
+
+ numa_enabled = (strtol(buf, 0, 10) >= 0);
+ if (numa_enabled)
+ printf(PFX "Device NUMA node detection is supported\n");
+ else if (ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env, sizeof(env)))
+ printf(PFX "Warning: Device NUMA node detection is not supported. "
+ "Please consider setting the environment variable "
+ "'MLX4_LOCAL_CPUS' or enable ACPI SLIT\n");
+out:
+ fclose(fp);
+}
+
+static void dump_cpu_set(cpuset_t *cpu_set)
+{
+ int i;
+ int first_cpu = -1;
+ int last_cpu = -1;
+ int n = 0;
+
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, cpu_set)) {
+ if (first_cpu < 0)
+ first_cpu = i;
+ if (i == CPU_SETSIZE - 1)
+ last_cpu = i;
+ } else if (first_cpu >= 0)
+ last_cpu = i - 1;
+
+ if (last_cpu >= 0) {
+ if (first_cpu != last_cpu)
+ printf("%s%d-%d", n ? "," : "", first_cpu,
+ last_cpu);
+ else
+ printf("%s%d", n ? "," : "", last_cpu);
+
+ first_cpu = -1;
+ last_cpu = -1;
+ ++n;
+ }
+ }
+}
+
+/*
+man cpuset
+
+ This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
+ are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
+ words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
+ within a word are also in big-endian order.
+
+ The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
+ the size of the bitmask.
+
+ Examples of the Mask Format:
+
+ 00000001 # just bit 0 set
+ 40000000,00000000,00000000 # just bit 94 set
+ 000000ff,00000000 # bits 32-39 set
+ 00000000,000E3862 # 1,5,6,11-13,17-19 set
+
+ A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
+
+ 00000001,00000001,00010117
+
+ The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
+ bit 4, and the "7" is for bits 2, 1, and 0.
+*/
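+/*
+ * Build the device-local CPU set from the MLX4_LOCAL_CPUS environment
+ * variable or from sysfs local_cpus, parsing the mask format described
+ * above: words are taken from the rightmost comma backwards, each word
+ * covering the next 32 CPU ids.
+ */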
+static void mlx4_local_cpu_set(struct ibv_context *context, cpuset_t *cpu_set)
+{
+ char *p, buf[1024];
+ char env_value[VERBS_MAX_ENV_VAL];
+ uint32_t word;
+ int i, k;
+
+ if (mlx4_trace)
+ mlx4_check_numa_enabled(context);
+
+ if (!ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env_value, sizeof(env_value))) {
+ strncpy(buf, env_value, sizeof(buf));
+ if (mlx4_trace)
+			printf(PFX "Local CPUs flags were overridden by %s\n", buf);
+ } else {
+ char fname[MAXPATHLEN];
+ FILE *fp;
+
+ snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus",
+ ibv_get_device_name(context->device));
+
+ fp = fopen(fname, "r");
+ if (!fp) {
+ fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
+ return;
+ }
+ if (!fgets(buf, sizeof(buf), fp)) {
+ fprintf(stderr, PFX "Warning: can not get local cpu set: failed to read cpu mask\n");
+ fclose(fp);
+ return;
+ }
+ fclose(fp);
+ }
- context->ibv_ctx.cmd_fd = cmd_fd;
+ p = strrchr(buf, ',');
+ if (!p)
+ p = buf;
- if (to_mdev(ibdev)->driver_abi_ver > 3) {
- if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp))
- goto err_free;
+ i = 0;
+ do {
+ if (*p == ',') {
+ *p = 0;
+			p++;
+ }
+
+ word = strtoul(p, 0, 16);
+
+ for (k = 0; word; ++k, word >>= 1)
+ if (word & 1)
+ CPU_SET(k+i, cpu_set);
+
+ if (p == buf)
+ break;
+
+ p = strrchr(buf, ',');
+ if (!p)
+ p = buf;
+
+ i += 32;
+ } while (i < CPU_SETSIZE);
+}
+
+static int mlx4_enable_sandy_bridge_fix(struct ibv_context *context)
+{
+ cpuset_t my_cpus, dev_local_cpus, result_set;
+ int stall_enable;
+ int ret;
+ int num_cores;
+
+ if (!mlx4_is_sandy_bridge(&num_cores))
+ return 0;
+
+ /* by default disable stall on sandy bridge arch */
+ stall_enable = 0;
+
+ /*
+ * check if app is bound to cpu set that is inside
+ * of device local cpu set. Disable stalling if true
+ */
+
+ /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
+ CPU_ZERO(&my_cpus);
+ CPU_ZERO(&dev_local_cpus);
+ CPU_ZERO(&result_set);
+ ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
+ sizeof(my_cpus), &my_cpus);
+ if (ret == -1) {
+ if (errno == EINVAL)
+ fprintf(stderr, PFX "Warning: my cpu set is too small\n");
+ else
+ fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
+ goto out;
+ }
+
+ if (mlx4_trace) {
+ printf(PFX "Running on cpus: ");
+ dump_cpu_set(&my_cpus);
+ printf("\n");
+ }
+
+ /* get device local cpu set */
+ mlx4_local_cpu_set(context, &dev_local_cpus);
+
+	/* make sure result_set is not initialized to all 0 */
+ CPU_SET(0, &result_set);
+ /* Set stall_enable if my cpu set and dev cpu set are disjoint sets */
+ CPU_AND(&result_set, &my_cpus);
+ CPU_AND(&result_set, &dev_local_cpus);
+ stall_enable = CPU_COUNT(&result_set) ? 0 : 1;
+
+ if (mlx4_trace) {
+ printf(PFX "HCA:%s local cpus: ", ibv_get_device_name(context->device));
+ dump_cpu_set(&dev_local_cpus);
+ printf("\n");
+ if (CPU_COUNT(&my_cpus) == num_cores) {
+ printf(PFX "Warning: CPU affinity wasn't used for this "
+			       "process. If the system has more than one NUMA node, it might be using a remote one.\n");
+			printf(PFX "         To achieve better performance, "
+ "please consider setting the CPU "
+ "affinity.\n");
+ }
+ }
+
+out:
+ if (mlx4_trace)
+ printf(PFX "Sandy Bridge CPU was detected, cq_stall is %s\n",
+ stall_enable ? "enabled" : "disabled");
+
+ return stall_enable;
+}
+
+static void mlx4_read_env(struct ibv_device *ibdev, struct mlx4_context *ctx)
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_TRACE", env_value, sizeof(env_value)) &&
+ (strcmp(env_value, "0")))
+ mlx4_trace = 1;
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_CQ_POLL", env_value, sizeof(env_value)) &&
+ !strcmp(env_value, "0"))
+			/* check if cq stall is overridden by the user */
+ ctx->stall_enable = 0;
+ else
+ /* autodetect if we need to do cq polling */
+ ctx->stall_enable = mlx4_enable_sandy_bridge_fix(&ctx->ibv_ctx);
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_NUM_LOOP", env_value, sizeof(env_value)))
+ mlx4_stall_num_loop = atoi(env_value);
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_SINGLE_THREADED", env_value, sizeof(env_value)))
+ mlx4_single_threaded = strcmp(env_value, "1") ? 0 : 1;
+
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx,
+ "MLX4_USE_MUTEX",
+ env_value,
+ sizeof(env_value)))
+ mlx4_use_mutex = strcmp(env_value, "1") ? 0 : 1;
+}
+
+void read_init_vars(struct mlx4_context *ctx)
+{
+ char env_value[VERBS_MAX_ENV_VAL];
+
+ pthread_mutex_lock(&ctx->env_mtx);
+ if (!ctx->env_initialized) {
+ mlx4_read_env(ctx->ibv_ctx.device, ctx);
+ if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_POST_SEND_PREFER_BF", env_value, sizeof(env_value))) {
+ ctx->prefer_bf = !!strcmp(env_value, "0");
+ if (mlx4_trace)
+ printf(PFX "prefer_bf=%d\n", ctx->prefer_bf);
+ } else {
+ ctx->prefer_bf = 1;
+ }
- context->num_qps = resp.qp_tab_size;
- context->num_xrc_srqs = resp.qp_tab_size;
- bf_reg_size = resp.bf_reg_size;
- context->cqe_size = resp.cqe_size;
+ ctx->env_initialized = 1;
+ }
+ pthread_mutex_unlock(&ctx->env_mtx);
+}
+
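+/*
+ * Provider hook that replaces the old alloc_context/free_context pair: the
+ * verbs layer allocates the context itself (sized via size_of_context, set
+ * below in mlx4_driver_init()) and calls this to initialize it over cmd_fd.
+ */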
+static int mlx4_init_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx, int cmd_fd)
+{
+ struct mlx4_context *context;
+ struct mlx4_alloc_ucontext_req req;
+ struct mlx4_alloc_ucontext_resp resp;
+ struct mlx4_alloc_ucontext_resp_v3 resp_v3;
+ int i;
+ struct ibv_exp_device_attr dev_attrs;
+ struct ibv_device_attr dev_legacy_attrs;
+ struct mlx4_device *dev = to_mdev(&v_device->device);
+ unsigned int qp_tab_size;
+ unsigned int bf_reg_size;
+ unsigned int cqe_size;
+ int hca_clock_offset;
+ void *hca_clock_page = NULL;
+
+ /* verbs_context should be used for new verbs.
+	 * The memory footprints of mlx4_context and verbs_context share
+ * struct ibv_context.
+ */
+ struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
+ struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ibv_ctx);
+
+ memset(&req, 0, sizeof(req));
+ context = to_mctx(ibv_ctx);
+ ibv_ctx->cmd_fd = cmd_fd;
+ ibv_ctx->device = &v_device->device;
+
+ if (pthread_mutex_init(&context->env_mtx, NULL))
+ return EIO;
+
+ if (dev->driver_abi_ver > 3) {
+#ifdef MLX4_WQE_FORMAT
+ req.lib_caps = MLX4_USER_DEV_CAP_WQE_FORMAT;
+#endif
+ if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req),
+ &resp.ibv_resp, sizeof(resp)))
+ return errno;
+
+ VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp));
+ qp_tab_size = resp.qp_tab_size;
+ bf_reg_size = resp.bf_reg_size;
+ context->bf_regs_per_page = resp.bf_regs_per_page;
+ cqe_size = resp.cqe_size;
} else {
- if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd,
- &resp_v3.ibv_resp, sizeof resp_v3))
- goto err_free;
-
- context->num_qps = resp_v3.qp_tab_size;
- context->num_xrc_srqs = resp_v3.qp_tab_size;
- bf_reg_size = resp_v3.bf_reg_size;
- context->cqe_size = 32;
+ if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req.cmd),
+ &resp_v3.ibv_resp, sizeof(resp_v3)))
+ return errno;
+
+ VALGRIND_MAKE_MEM_DEFINED(&resp_v3, sizeof(resp_v3));
+ qp_tab_size = resp_v3.qp_tab_size;
+ bf_reg_size = resp_v3.bf_reg_size;
+ context->bf_regs_per_page = resp_v3.bf_regs_per_page;
+ cqe_size = 32;
}
+ context->num_qps = qp_tab_size;
context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS;
context->qp_table_mask = (1 << context->qp_table_shift) - 1;
+ context->cqe_size = cqe_size;
+ for (i = 0; i < MLX4_PORTS_NUM; ++i)
+ context->port_query_cache[i].valid = 0;
pthread_mutex_init(&context->qp_table_mutex, NULL);
for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
context->qp_table[i].refcnt = 0;
- context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1
- - MLX4_XRC_SRQ_TABLE_BITS;
- context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1;
-
- pthread_mutex_init(&context->xrc_srq_table_mutex, NULL);
- for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i)
- context->xrc_srq_table[i].refcnt = 0;
-
for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
context->db_list[i] = NULL;
+ mlx4_init_xsrq_table(&context->xsrq_table, qp_tab_size);
pthread_mutex_init(&context->db_list_mutex, NULL);
- context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE,
+ context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
MAP_SHARED, cmd_fd, 0);
if (context->uar == MAP_FAILED)
- goto err_free;
+ return errno;
if (bf_reg_size) {
- context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size,
- PROT_WRITE, MAP_SHARED, cmd_fd,
- to_mdev(ibdev)->page_size);
- if (context->bf_page == MAP_FAILED) {
+ context->bfs.page = mmap(NULL, dev->page_size,
+ PROT_WRITE, MAP_SHARED, cmd_fd,
+ dev->page_size);
+ if (context->bfs.page == MAP_FAILED) {
fprintf(stderr, PFX "Warning: BlueFlame available, "
"but failed to mmap() BlueFlame page.\n");
- context->bf_page = NULL;
- context->bf_buf_size = 0;
+ context->bfs.page = NULL;
+ context->bfs.buf_size = 0;
+ context->bfs.num_dedic_bfs = 0;
} else {
- context->bf_buf_size = bf_reg_size / 2;
- context->bf_offset = 0;
- pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE);
+ context->bfs.num_dedic_bfs = min(context->bf_regs_per_page - 1,
+ MLX4_MAX_BFS_IN_PAGE - 1);
+ context->bfs.buf_size = bf_reg_size / 2;
+ mlx4_spinlock_init(&context->bfs.dedic_bf_lock, !mlx4_single_threaded);
+ context->bfs.cmn_bf.address = context->bfs.page;
+
+ mlx4_lock_init(&context->bfs.cmn_bf.lock,
+ !mlx4_single_threaded,
+ mlx4_get_locktype());
+
+ context->bfs.dedic_bf_free = context->bfs.num_dedic_bfs;
+ for (i = 0; i < context->bfs.num_dedic_bfs; i++) {
+ context->bfs.dedic_bf[i].address = context->bfs.page + (i + 1) * MLX4_BFS_STRIDE;
+ context->bfs.dedic_bf_used[i] = 0;
+ }
}
} else {
- context->bf_page = NULL;
- context->bf_buf_size = 0;
+ context->bfs.page = NULL;
+ context->bfs.buf_size = 0;
+ context->bfs.num_dedic_bfs = 0;
}
- pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
+ mlx4_spinlock_init(&context->uar_lock, !mlx4_single_threaded);
- context->ibv_ctx.ops = mlx4_ctx_ops;
-#ifdef HAVE_IBV_XRC_OPS
- context->ibv_ctx.more_ops = &mlx4_more_ops;
-#endif
+ mlx4_spinlock_init(&context->send_db_lock, !mlx4_single_threaded);
+ INIT_LIST_HEAD(&context->send_db_list);
+
+ mlx4_spinlock_init(&context->hugetlb_lock, !mlx4_single_threaded);
+ INIT_LIST_HEAD(&context->hugetlb_list);
- if (mlx4_query_device(&context->ibv_ctx, &dev_attrs))
- goto query_free;
+ pthread_mutex_init(&context->task_mutex, NULL);
+
+ memset(&dev_attrs, 0, sizeof(dev_attrs));
+ dev_attrs.comp_mask = IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK |
+ IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK |
+ IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
+ IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN;
+
+ if (mlx4_exp_query_device(ibv_ctx, &dev_attrs)) {
+ if (mlx4_query_device(ibv_ctx, &dev_legacy_attrs))
+ goto query_free;
+
+ memcpy(&dev_attrs, &dev_legacy_attrs, sizeof(dev_legacy_attrs));
+ }
context->max_qp_wr = dev_attrs.max_qp_wr;
context->max_sge = dev_attrs.max_sge;
context->max_cqe = dev_attrs.max_cqe;
- if (!(dev_attrs.device_cap_flags & IBV_DEVICE_XRC)) {
- fprintf(stderr, PFX "There is a mismatch between "
- "the kernel and the userspace libraries: "
- "Kernel does not support XRC. Exiting.\n");
- goto query_free;
+ context->exp_device_cap_flags = dev_attrs.exp_device_cap_flags;
+ if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN)
+ context->max_ctx_res_domain = dev_attrs.max_ctx_res_domain;
+
+ VALGRIND_MAKE_MEM_DEFINED(&context->hca_core_clock, sizeof(context->hca_core_clock));
+ if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) {
+ if (dev_attrs.hca_core_clock)
+ context->core_clk.mult = ((1ull * 1000) << 29) /
+ dev_attrs.hca_core_clock;
+ else
+ context->core_clk.mult = 0;
+
+ context->core_clk.shift = 29;
+ context->core_clk.mask = dev_attrs.timestamp_mask;
+
+ if (ioctl(cmd_fd, MLX4_IOCHWCLOCKOFFSET,
+ &hca_clock_offset) >= 0) {
+ VALGRIND_MAKE_MEM_DEFINED(&hca_clock_offset, sizeof(hca_clock_offset));
+ context->core_clk.offset = hca_clock_offset;
+ hca_clock_page = mmap(NULL, hca_clock_offset +
+ sizeof(context->core_clk.mask),
+ PROT_READ, MAP_SHARED, cmd_fd,
+ dev->page_size *
+ (MLX4_IB_MMAP_GET_HW_CLOCK));
+
+ if (hca_clock_page == MAP_FAILED) {
+ fprintf(stderr, PFX
+					"Warning: Timestamp available, "
+ "but failed to mmap() hca core "
+ "clock page.\n");
+ } else {
+ context->hca_core_clock = hca_clock_page +
+ context->core_clk.offset;
+ }
+ }
}
- return &context->ibv_ctx;
+ ibv_ctx->ops = mlx4_ctx_ops;
+
+ verbs_ctx->has_comp_mask |= VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
+ VERBS_CONTEXT_QP;
+
+ verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
+ verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
+ verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
+ verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
+ verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
+ verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
+ verbs_set_ctx_op(verbs_ctx, create_flow, ibv_cmd_create_flow);
+ verbs_set_ctx_op(verbs_ctx, destroy_flow, ibv_cmd_destroy_flow);
+
+ /*
+ * Set experimental verbs
+ */
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_reg_shared_mr, mlx4_reg_shared_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow, ibv_exp_cmd_create_flow);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow, ibv_exp_cmd_destroy_flow);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah, mlx4_exp_create_ah);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device, mlx4_exp_query_device);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp, mlx4_exp_create_qp);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp, mlx4_exp_modify_qp);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port, mlx4_exp_query_port);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx4_modify_cq);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx4_post_task);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc, mlx4_set_legacy_xrc);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc, mlx4_get_legacy_xrc);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq, mlx4_exp_poll_cq);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx4_create_cq_ex);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values, mlx4_query_values);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx4_exp_reg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send, mlx4_exp_post_send);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_bind_mw, mlx4_exp_bind_mw);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_rereg_mr, mlx4_exp_rereg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx4_exp_dereg_mr);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain, mlx4_exp_create_res_domain);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain, mlx4_exp_destroy_res_domain);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx4_exp_query_intf);
+ verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf, mlx4_exp_release_intf);
+
+ return 0;
query_free:
- munmap(context->uar, to_mdev(ibdev)->page_size);
- if (context->bf_page)
- munmap(context->bf_page, to_mdev(ibdev)->page_size);
-
-err_free:
- free(context);
- return NULL;
+ munmap(context->uar, dev->page_size);
+ if (context->bfs.page)
+ munmap(context->bfs.page, dev->page_size);
+ if (hca_clock_page)
+ munmap(hca_clock_page, hca_clock_offset +
+ sizeof(context->core_clk.mask));
+
+ return errno;
}
-static void mlx4_free_context(struct ibv_context *ibctx)
+static void mlx4_uninit_context(struct verbs_device *v_device,
+ struct ibv_context *ibv_ctx)
{
- struct mlx4_context *context = to_mctx(ibctx);
-
- munmap(context->uar, to_mdev(ibctx->device)->page_size);
- if (context->bf_page)
- munmap(context->bf_page, to_mdev(ibctx->device)->page_size);
- free(context);
+ struct mlx4_context *context = to_mctx(ibv_ctx);
+
+ munmap(context->uar, to_mdev(&v_device->device)->page_size);
+ if (context->bfs.page)
+ munmap(context->bfs.page,
+ to_mdev(&v_device->device)->page_size);
+ if (context->hca_core_clock)
+ munmap((context->hca_core_clock - context->core_clk.offset),
+ context->core_clk.offset + sizeof(context->core_clk.mask));
}
-static struct ibv_device_ops mlx4_dev_ops = {
- .alloc_context = mlx4_alloc_context,
- .free_context = mlx4_free_context
-};
-
-static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path,
- int abi_version)
+static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path,
+ int abi_version)
{
char value[8];
- struct mlx4_device *dev;
+ struct mlx4_device *dev;
unsigned vendor, device;
int i;
if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
value, sizeof value) < 0)
return NULL;
- sscanf(value, "%i", &vendor);
+ vendor = strtol(value, NULL, 16);
if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
value, sizeof value) < 0)
return NULL;
- sscanf(value, "%i", &device);
+ device = strtol(value, NULL, 16);
for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
if (vendor == hca_table[i].vendor &&
@@ -300,24 +737,32 @@
return NULL;
}
- dev = malloc(sizeof *dev);
+ dev = calloc(1, sizeof(*dev));
if (!dev) {
fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
uverbs_sys_path);
return NULL;
}
- dev->ibv_dev.ops = mlx4_dev_ops;
dev->page_size = sysconf(_SC_PAGESIZE);
+
+ dev->devid.id = device;
dev->driver_abi_ver = abi_version;
- return &dev->ibv_dev;
+ dev->verbs_dev.sz = sizeof(*dev);
+ dev->verbs_dev.size_of_context =
+ sizeof(struct mlx4_context) - sizeof(struct ibv_context);
+ /* mlx4_init_context will initialize provider calls */
+ dev->verbs_dev.init_context = mlx4_init_context;
+ dev->verbs_dev.uninit_context = mlx4_uninit_context;
+
+ return &dev->verbs_dev;
}
#ifdef HAVE_IBV_REGISTER_DRIVER
static __attribute__((constructor)) void mlx4_register_driver(void)
{
- ibv_register_driver("mlx4", mlx4_driver_init);
+ verbs_register_driver("mlx4", mlx4_driver_init);
}
#else
/*
Index: contrib/ofed/libmlx4/src/mlx4_exp.h
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/mlx4_exp.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_EXP_H
+#define MLX4_EXP_H
+
+#include <infiniband/kern-abi_exp.h>
+#include "mlx4.h"
+
+/*
+ * mlx4-abi experimental structs
+ */
+struct mlx4_exp_create_qp {
+ struct ibv_exp_create_qp ibv_cmd;
+ struct mlx4_exp_create_qp_provider exp_cmd;
+};
+
+struct mlx4_exp_create_cq {
+ struct ibv_exp_create_cq ibv_cmd;
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+/*
+ * Experimental functions
+ */
+struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr);
+int mlx4_exp_query_device(struct ibv_context *context,
+ struct ibv_exp_device_attr *attr);
+int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num,
+ struct ibv_exp_port_attr *port_attr);
+int mlx4_exp_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr,
+ int attr_mask);
+int mlx4_exp_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd,
+ void *addr, size_t length, uint64_t access,
+ struct ibv_exp_rereg_mr_attr *attr, struct ibv_exp_rereg_out *out);
+int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out);
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain_init_attr *attr);
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain *res_dom,
+ struct ibv_exp_destroy_res_domain_attr *attr);
+void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status);
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+ struct ibv_exp_release_intf_params *params);
+
+#endif /* MLX4_EXP_H */
Index: contrib/ofed/libmlx4/src/qp.c
===================================================================
--- contrib/ofed/libmlx4/src/qp.c
+++ contrib/ofed/libmlx4/src/qp.c
@@ -40,11 +40,40 @@
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>
+#include <errno.h>
#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
+#ifndef htobe64
+#include <endian.h>
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+# define htobe64(x) __bswap_64 (x)
+# else
+# define htobe64(x) (x)
+# endif
+#endif
+
+#ifdef MLX4_WQE_FORMAT
+ #define SET_BYTE_COUNT(byte_count) (htonl(byte_count) | owner_bit)
+ #define WQE_CTRL_OWN (1 << 30)
+#else
+ #define SET_BYTE_COUNT(byte_count) htonl(byte_count)
+ #define WQE_CTRL_OWN (1 << 31)
+#endif
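+/*
+ * In wqe-format mode the ownership indication moves from bit 31 to bit 30 of
+ * the ctrl segment and is also OR-ed into every byte_count via
+ * SET_BYTE_COUNT() (which expects an 'owner_bit' variable in the caller's
+ * scope), so the send queue no longer needs to be stamped.
+ */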
+enum {
+ MLX4_OPCODE_BASIC = 0x00010000,
+ MLX4_OPCODE_MANAGED = 0x00020000,
+
+ MLX4_OPCODE_WITH_IMM = 0x01000000
+};
+
+#define MLX4_IB_OPCODE(op, class, attr) (((class) & 0x00FF0000) | ((attr) & 0xFF000000) | ((op) & 0x0000FFFF))
+#define MLX4_IB_OPCODE_GET_CLASS(opcode) ((opcode) & 0x00FF0000)
+#define MLX4_IB_OPCODE_GET_OP(opcode) ((opcode) & 0x0000FFFF)
+#define MLX4_IB_OPCODE_GET_ATTR(opcode) ((opcode) & 0xFF000000)
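+/*
+ * MLX4_IB_OPCODE() packs three fields into one 32-bit value: bits 0-15 carry
+ * the hardware opcode, bits 16-23 the opcode class (basic or
+ * managed/cross-channel) and bits 24-31 attribute flags such as
+ * MLX4_OPCODE_WITH_IMM.
+ */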
+
static const uint32_t mlx4_ib_opcode[] = {
[IBV_WR_SEND] = MLX4_OPCODE_SEND,
[IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM,
@@ -55,14 +84,151 @@
[IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA,
};
-static void *get_recv_wqe(struct mlx4_qp *qp, int n)
+
+static const uint32_t mlx4_ib_opcode_exp[] = {
+ [IBV_EXP_WR_SEND] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_RDMA_WRITE] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_RDMA_WRITE_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_RDMA_READ] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_READ, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_CS, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_FA, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_CS, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_FA, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_LOCAL_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_LOCAL_INVAL, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_WITH_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_INVAL, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM),
+ [IBV_EXP_WR_BIND_MW] = MLX4_IB_OPCODE(MLX4_OPCODE_BIND_MW, MLX4_OPCODE_BASIC, 0),
+ [IBV_EXP_WR_SEND_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_ENABLE, MLX4_OPCODE_MANAGED, 0),
+ [IBV_EXP_WR_RECV_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_RECV_ENABLE, MLX4_OPCODE_MANAGED, 0),
+ [IBV_EXP_WR_CQE_WAIT] = MLX4_IB_OPCODE(MLX4_OPCODE_CQE_WAIT, MLX4_OPCODE_MANAGED, 0),
+};
+
+enum {
+ MLX4_CALC_FLOAT64_ADD = 0x00,
+ MLX4_CALC_UINT64_ADD = 0x01,
+ MLX4_CALC_UINT64_MAXLOC = 0x02,
+ MLX4_CALC_UINT64_AND = 0x03,
+ MLX4_CALC_UINT64_XOR = 0x04,
+ MLX4_CALC_UINT64_OR = 0x05
+};
+
+enum {
+ MLX4_WQE_CTRL_CALC_OP = 26
+};
+
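+/*
+ * Cross-channel CALC operations, indexed by [data size][operation][data type].
+ * Only entries with .valid set are supported; .opcode already holds the value
+ * shifted into place for the WQE ctrl segment (MLX4_WQE_CTRL_CALC_OP, bit 26).
+ */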
+static const struct mlx4_calc_op {
+ int valid;
+ uint32_t opcode;
+} mlx4_calc_ops_table
+ [IBV_EXP_CALC_DATA_SIZE_NUMBER]
+ [IBV_EXP_CALC_OP_NUMBER]
+ [IBV_EXP_CALC_DATA_TYPE_NUMBER] = {
+ [IBV_EXP_CALC_DATA_SIZE_64_BIT] = {
+ [IBV_EXP_CALC_OP_ADD] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_FLOAT64_ADD << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BXOR] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BAND] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_BOR] = {
+ [IBV_EXP_CALC_DATA_TYPE_INT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP },
+ [IBV_EXP_CALC_DATA_TYPE_FLOAT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }
+ },
+ [IBV_EXP_CALC_OP_MAXLOC] = {
+ [IBV_EXP_CALC_DATA_TYPE_UINT] = {
+ .valid = 1,
+ .opcode = MLX4_CALC_UINT64_MAXLOC << MLX4_WQE_CTRL_CALC_OP }
+ }
+ }
+};
+
+static int post_send_other(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_rc_raw_packet(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_ud(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_rc_uc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+static int post_send_xrc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind) __MLX4_ALGN_FUNC__;
+
+#define MLX4_WAIT_EN_VALID (1<<30)
+
+static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) __attribute__((always_inline));
+static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count)
{
- return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
+ struct mlx4_wqe_wait_en_seg *seg = (struct mlx4_wqe_wait_en_seg *)wqe_seg;
+
+ seg->valid = htonl(MLX4_WAIT_EN_VALID);
+ seg->pi = htonl(count);
+ seg->obj_num = htonl(obj_num);
+
+ return;
}
-static void *get_send_wqe(struct mlx4_qp *qp, int n)
+static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) __attribute__((always_inline));
+static inline void *get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return qp->rq.buf + (n << qp->rq.wqe_shift);
+}
+
+void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n)
+{
+ return get_recv_wqe(qp, n);
+}
+
+static void *get_send_wqe64(struct mlx4_qp *qp, unsigned int n)
+{
+ return qp->sq.buf + (n << 6);
+}
+static void *get_send_wqe(struct mlx4_qp *qp, unsigned int n)
{
- return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
+ return qp->sq.buf + (n << qp->sq.wqe_shift);
}
/*
@@ -70,7 +236,48 @@
* first four bytes of every 64 byte chunk with 0xffffffff, except for
* the very first chunk of the WQE.
*/
-static void stamp_send_wqe(struct mlx4_qp *qp, int n)
+void mlx4_init_qp_indices(struct mlx4_qp *qp)
+{
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head_en_index = 0;
+ qp->sq.head_en_count = 0;
+ qp->rq.head_en_index = 0;
+ qp->rq.head_en_count = 0;
+}
+
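+/*
+ * wqe-format replacement for the stamping scheme below: pre-mark the first
+ * dword of every 64-byte chunk of the send queue as HW-owned, and let
+ * set_owner_wqe() refresh the unused tail of each WQE after posting.
+ */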
+#ifdef MLX4_WQE_FORMAT
+void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
+{
+ __be32 *wqe = get_send_wqe(qp, 0);
+ int wq_size = (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ int i;
+
+ for (i = 0; i < wq_size; i += 64)
+ wqe[i / 4] = htonl(WQE_CTRL_OWN);
+}
+
+static void set_owner_wqe(struct mlx4_qp *qp, unsigned int idx, int ds,
+ uint32_t owner_bit)
+{
+ uint32_t *wqe;
+ int max_sz = (1 << qp->sq.wqe_shift) / 4;
+ int cur_sz = ds * 4;
+ int tail_sz;
+ int i;
+
+ if (max_sz - cur_sz < 16)
+ return;
+
+ wqe = get_send_wqe(qp, idx & (qp->sq.wqe_cnt - 1));
+ tail_sz = max_sz - cur_sz;
+ for (i = 0; tail_sz > 16; i += 4, tail_sz -= 16)
+ wqe[cur_sz + i * 4] = owner_bit;
+}
+#else
+static void stamp_send_wqe(struct mlx4_qp *qp, unsigned int n)
{
uint32_t *wqe = get_send_wqe(qp, n);
int i;
@@ -80,14 +287,6 @@
wqe[i] = 0xffffffff;
}
-void mlx4_init_qp_indices(struct mlx4_qp *qp)
-{
- qp->sq.head = 0;
- qp->sq.tail = 0;
- qp->rq.head = 0;
- qp->rq.tail = 0;
-}
-
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
struct mlx4_wqe_ctrl_seg *ctrl;
@@ -95,29 +294,78 @@
for (i = 0; i < qp->sq.wqe_cnt; ++i) {
ctrl = get_send_wqe(qp, i);
- ctrl->owner_opcode = htonl(1 << 31);
+ ctrl->owner_opcode = htonl(WQE_CTRL_OWN);
ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
stamp_send_wqe(qp, i);
}
}
+#endif
-static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
+static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((noinline));
+static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp)
{
+ struct mlx4_cq *cq = to_mcq(qp->verbs_qp.qp.send_cq);
unsigned cur;
+ mlx4_lock(&cq->lock);
cur = wq->head - wq->tail;
- if (cur + nreq < wq->max_post)
- return 0;
+ mlx4_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((always_inline));
+static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp)
+{
+ unsigned cur;
- pthread_spin_lock(&cq->lock);
cur = wq->head - wq->tail;
- pthread_spin_unlock(&cq->lock);
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
- return cur + nreq >= wq->max_post;
+ return __wq_overflow(wq, nreq, qp);
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_exp_send_wr *wr)
+{
+ uint64_t acc = wr->bind_mw.bind_info.exp_mw_access_flags;
+ bseg->flags1 = 0;
+ if (acc & IBV_EXP_ACCESS_REMOTE_ATOMIC)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_ATOMIC);
+ if (acc & IBV_EXP_ACCESS_REMOTE_WRITE)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_WRITE);
+ if (acc & IBV_EXP_ACCESS_REMOTE_READ)
+ bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_READ);
+
+ bseg->flags2 = 0;
+ if (((struct verbs_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
+ bseg->flags2 |= htonl(MLX4_WQE_BIND_TYPE_2);
+ if (acc & IBV_EXP_ACCESS_MW_ZERO_BASED)
+ bseg->flags2 |= htonl(MLX4_WQE_BIND_ZERO_BASED);
+
+ bseg->new_rkey = htonl(wr->bind_mw.rkey);
+ bseg->lkey = htonl(wr->bind_mw.bind_info.mr->lkey);
+ bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
+ bseg->length = htobe64(wr->bind_mw.bind_info.length);
+}
+
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey) __attribute__((always_inline));
+static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
+ uint32_t rkey)
+{
+ iseg->mem_key = htonl(rkey);
+
+ iseg->reserved1 = 0;
+ iseg->reserved2 = 0;
+ iseg->reserved3[0] = 0;
+ iseg->reserved3[1] = 0;
}
static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ uint64_t remote_addr, uint32_t rkey) __attribute__((always_inline));
+static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
uint64_t remote_addr, uint32_t rkey)
{
rseg->raddr = htonll(remote_addr);
@@ -125,16 +373,33 @@
rseg->reserved = 0;
}
-static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
+ struct ibv_exp_send_wr *wr)
{
- if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
+ struct ibv_exp_fetch_add *fa;
+
+ if (wr->exp_opcode == IBV_EXP_WR_ATOMIC_CMP_AND_SWP) {
aseg->swap_add = htonll(wr->wr.atomic.swap);
aseg->compare = htonll(wr->wr.atomic.compare_add);
+ } else if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) {
+ fa = &wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add;
+ aseg->swap_add = htonll(fa->add_val);
+ aseg->compare = htonll(fa->field_boundary);
} else {
aseg->swap_add = htonll(wr->wr.atomic.compare_add);
aseg->compare = 0;
}
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+ struct ibv_exp_send_wr *wr)
+{
+ struct ibv_exp_cmp_swap *cs = &wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap;
+ aseg->swap_data = htonll(cs->swap_val);
+ aseg->cmp_data = htonll(cs->compare_val);
+ aseg->swap_mask = htonll(cs->swap_mask);
+ aseg->cmp_mask = htonll(cs->compare_mask);
}
static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
@@ -147,14 +412,18 @@
memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}
-static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) __attribute__((always_inline));
+static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
dseg->byte_count = htonl(sg->length);
dseg->lkey = htonl(sg->lkey);
dseg->addr = htonll(sg->addr);
}
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
+static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg,
+ struct ibv_sge *sg, unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg,
+ struct ibv_sge *sg, unsigned int owner_bit)
{
dseg->lkey = htonl(sg->lkey);
dseg->addr = htonll(sg->addr);
@@ -169,7 +438,10 @@
*/
wmb();
- dseg->byte_count = htonl(sg->length);
+ if (likely(sg->length))
+ dseg->byte_count = SET_BYTE_COUNT(sg->length);
+ else
+ dseg->byte_count = htonl(0x80000000);
}
/*
@@ -177,84 +449,787 @@
* implementations may use move-string-buffer assembler instructions,
* which do not guarantee order of copying.
*/
-static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+#if defined(__amd64__)
+#define COPY_64B_WC(dst, src) \
+ __asm__ __volatile__ ( \
+ " movdqa (%1),%%xmm0\n" \
+ " movdqa 16(%1),%%xmm1\n" \
+ " movdqa 32(%1),%%xmm2\n" \
+ " movdqa 48(%1),%%xmm3\n" \
+ " movntdq %%xmm0, (%0)\n" \
+ " movntdq %%xmm1, 16(%0)\n" \
+ " movntdq %%xmm2, 32(%0)\n" \
+ " movntdq %%xmm3, 48(%0)\n" \
+ : : "r" (dst), "r" (src) : "memory"); \
+ dst += 8; \
+ src += 8
+#else
+#define COPY_64B_WC(dst, src) \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++; \
+ *dst++ = *src++
+#endif
+
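+/*
+ * On amd64 COPY_64B_WC uses SSE2 non-temporal stores (movntdq) so each
+ * 64-byte chunk goes straight through the write-combining BlueFlame
+ * mapping; other architectures fall back to plain 64-bit copies.
+ */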
+static void mlx4_bf_copy(uint64_t *dst, uint64_t *src, unsigned bytecnt)
{
while (bytecnt > 0) {
- *dst++ = *src++;
- *dst++ = *src++;
- bytecnt -= 2 * sizeof (long);
+ COPY_64B_WC(dst, src);
+ bytecnt -= 8 * sizeof(uint64_t);
+ }
+}
+
+/* Convert WQE format to fit BF usage */
+static inline void convert_to_bf_wqe(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const unsigned wqe_idx) __attribute__((always_inline));
+static inline void convert_to_bf_wqe(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const unsigned wqe_idx)
+{
+ uint32_t *tmp = (uint32_t *)ctrl->reserved;
+
+ ctrl->owner_opcode |= htonl((wqe_idx & 0xffff) << 8);
+ *tmp |= qp->doorbell_qpn;
+}
+
+static inline void copy_wqe_to_bf(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const int aligned_size,
+ const unsigned wqe_idx,
+ const int dedic_bf,
+ const int one_thread_auto_evict) __attribute__((always_inline));
+static inline void copy_wqe_to_bf(struct mlx4_qp *qp,
+ struct mlx4_wqe_ctrl_seg *ctrl,
+ const int aligned_size,
+ const unsigned wqe_idx,
+ const int dedic_bf,
+ const int one_thread_auto_evict)
+{
+ convert_to_bf_wqe(qp, ctrl, wqe_idx);
+
+ if (dedic_bf && one_thread_auto_evict)
+ /*
+		 * If the QP has a dedicated BF, only one thread is using this QP
+		 * and the CPU arch supports auto eviction of the WC buffer, we can
+		 * move the wc_wmb before the bf_copy (usually it is located after
+		 * the bf_copy). This provides a significant improvement in the
+		 * message rate of small messages.
+ * This barrier keeps BF toggling order by ensuring that previous BF data
+ * is written to memory before writing to the next BF buffer.
+ */
+ wc_wmb();
+ else
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ if (dedic_bf) {
+ mlx4_bf_copy(qp->bf->dedic.address, (uint64_t *) ctrl, aligned_size);
+ } else {
+ mlx4_lock(&qp->bf->cmn.lock);
+ mlx4_bf_copy(qp->bf->cmn.address, (uint64_t *) ctrl, aligned_size);
+ }
+ if (!(dedic_bf && one_thread_auto_evict))
+ /*
+ * This barrier ensures that BF data is written to memory
+ * before toggling the BF buffer. This is to keep the right
+		 * toggling order and to prevent the case in which the next BF data
+		 * is written before the current BF data.
+		 * In addition, this barrier ensures the eviction of the WC buffer.
+ * See comment above for the conditions in which this barrier may be
+ * set before the bf_copy.
+ */
+ wc_wmb();
+
+ if (dedic_bf) {
+ /* Toggle BF buffer */
+ qp->bf->dedic.address = (void *)((uintptr_t)qp->bf->dedic.address ^ qp->bf_buf_size);
+ } else {
+ /* Toggle BF buffer */
+ qp->bf->cmn.address = (void *)((uintptr_t)qp->bf->cmn.address ^ qp->bf_buf_size);
+ mlx4_unlock(&qp->bf->cmn.lock);
+ }
+}
+
+static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl,
+ const int use_bf, const int dedic_bf, const int one_thread_auto_evict,
+ const int prefer_bf) __attribute__((always_inline));
+static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl,
+ const int use_bf, const int dedic_bf, const int one_thread_auto_evict,
+ const int prefer_bf)
+{
+ if (use_bf && nreq == 1 && (inl || prefer_bf) &&
+ size > 1 && size <= qp->bf_buf_size / 16) {
+ copy_wqe_to_bf(qp, ctrl, align(size * 16, 64),
+			       qp->sq.head, dedic_bf,
+ one_thread_auto_evict);
+ ++qp->sq.head;
+ } else if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * ringing non-cached doorbell record.
+ */
+ nc_wmb();
+ *qp->sdb = qp->doorbell_qpn;
+ }
+}
+
+static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl) __attribute__((noinline));
+static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl)
+{
+ struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context);
+
+ if (nreq == 1 && (inl || ctx->prefer_bf) && size > 1 && size <= qp->bf_buf_size / 16) {
+ convert_to_bf_wqe(qp, ctrl, qp->sq.head);
+
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ ++qp->sq.head;
+
+ wmb();
+
+ } else if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /* Controlled qp */
+ wmb();
+ }
+}
+
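+/*
+ * Ring the send doorbell using the method selected for this QP: a single
+ * small WQE can be copied straight to a BlueFlame register (dedicated or
+ * shared), otherwise the doorbell record pointed to by qp->sdb is written
+ * once the descriptors are visible in memory.
+ */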
+static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl) __attribute__((always_inline));
+static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl,
+ int nreq, int size, int inl)
+{
+ if (unlikely(qp->create_flags & IBV_EXP_QP_CREATE_MANAGED_SEND))
+ return __ring_db_mng(qp, ctrl, nreq, size, inl);
+
+ switch (qp->db_method) {
+ case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 1);
+ case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 0);
+ case MLX4_QP_DB_METHOD_DEDIC_BF:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ case MLX4_QP_DB_METHOD_BF:
+ return __ring_db(qp, ctrl, nreq, size, inl, 1, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ case MLX4_QP_DB_METHOD_DB:
+ return __ring_db(qp, ctrl, nreq, size, inl, 0, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf);
+ }
+}
+
+static void set_ctrl_seg(struct mlx4_wqe_ctrl_seg *ctrl, struct ibv_send_wr *wr,
+ struct mlx4_qp *qp, uint32_t imm, uint32_t srcrb_flags,
+ unsigned int owner_bit, int size, uint32_t wr_op)
+{
+ ctrl->srcrb_flags = srcrb_flags;
+ ctrl->imm = imm;
+ ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+ ctrl->owner_opcode = htonl(wr_op) | owner_bit;
+}
+
+static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline));
+static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+ int i;
+ int inl = 0;
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+ off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < num_sge; ++i) {
+ addr = (void *) (uintptr_t) sg_list[i].addr;
+ len = sg_list[i].length;
+ inl += len;
+
+ if (unlikely(inl > qp->max_inline_data))
+ return ENOMEM;
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ wmb(); /* see comment below */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len));
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof(*seg);
+ off = sizeof(*seg);
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (likely(seg_len)) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len));
+ }
+
+ *size += (inl + num_seg * sizeof(*seg) + 15) / 16;
+
+ return 0;
+}
+
+static inline void set_data_inl_seg_fast(struct mlx4_qp *qp,
+ void *addr, int length,
+ void *wqe, int *size,
+ unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_data_inl_seg_fast(struct mlx4_qp *qp,
+ void *addr, int length,
+ void *wqe, int *size,
+ unsigned int owner_bit)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ static const int first_seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg) - sizeof(struct mlx4_wqe_ctrl_seg);
+ static const int seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg);
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+
+ if (length <= first_seg_data_size) {
+ /* For the first segment there is no need to make sure
+ * all the data is visible before the byte_count field is set.
+		 * This is because the ctrl segment at the beginning of the
+		 * WQE already covers the HCA prefetcher issue.
+ */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length));
+
+ memcpy(wqe, addr, length);
+ *size += (length + sizeof(*seg) + 15) / 16;
+ } else {
+ void *start_wqe = seg;
+
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | first_seg_data_size));
+ memcpy(wqe, addr, first_seg_data_size);
+ length -= first_seg_data_size;
+ addr += first_seg_data_size;
+ seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg));
+ wqe += MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg);
+
+ while (length > seg_data_size) {
+ memcpy(wqe, addr, seg_data_size);
+ wmb(); /* see comment below */
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_data_size));
+			length -= seg_data_size;
+ addr += seg_data_size;
+ seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN);
+ wqe += MLX4_INLINE_ALIGN;
+ }
+ memcpy(wqe, addr, length);
+
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length));
+ *size += (wqe + length - start_wqe + 15) / 16;
+ }
+}
+
+static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline));
+static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list,
+ void *wqe, int *size, unsigned int owner_bit)
+{
+ if (likely(num_sge == 1)) {
+ struct mlx4_wqe_data_seg *seg = wqe;
+
+ set_ptr_data(seg, sg_list, owner_bit);
+
+ *size += (sizeof(*seg) / 16);
+ } else {
+ struct mlx4_wqe_data_seg *seg = wqe;
+ int i;
+
+ for (i = num_sge - 1; i >= 0 ; --i)
+ set_ptr_data(seg + i, sg_list + i, owner_bit);
+
+ *size += num_sge * (sizeof(*seg) / 16);
+ }
+}
+
+static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl,
+ int num_sge, struct ibv_sge *sg_list, int *inl,
+ unsigned int owner_bit) __attribute__((always_inline));
+static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl,
+ int num_sge, struct ibv_sge *sg_list, int *inl,
+ unsigned int owner_bit)
+{
+ if (is_inl) {
+ /* inl is set to true if this is an inline data segment and num_sge > 0 */
+ *inl = num_sge > 0;
+ return set_data_inl_seg(qp, num_sge, sg_list, seg, sz,
+ owner_bit);
+ }
+ set_data_non_inl_seg(qp, num_sge, sg_list, seg, sz, owner_bit);
+
+ return 0;
+}
+
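+/*
+ * Common tail of the per-transport post_send_*() helpers: write the data
+ * segments (inline or pointer) and then the ctrl segment, returning the
+ * total WQE size in 16-byte units through *total_size.
+ */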
+static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp,
+ uint32_t srcrb_flags, uint32_t imm,
+ void *wqe, void *ctrl, int size, int *total_size,
+ int *inl, unsigned int ind) __attribute__((always_inline));
+static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp,
+ uint32_t srcrb_flags, uint32_t imm,
+ void *wqe, void *ctrl, int size, int *total_size,
+ int *inl, unsigned int ind)
+{
+ int ret;
+ unsigned int owner_bit = (ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0;
+
+ ret = set_data_seg(qp, wqe, &size, !!(wr->send_flags & IBV_SEND_INLINE),
+ wr->num_sge, wr->sg_list, inl, owner_bit);
+ if (unlikely(ret))
+ return ret;
+
+ *total_size = size;
+ set_ctrl_seg(ctrl, wr, qp, imm, srcrb_flags, owner_bit, size,
+ mlx4_ib_opcode[wr->opcode]);
+
+ return 0;
+
+}
+
+static int post_send_other(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
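+	/*
+	 * idx is a 2-bit index into qp->srcrb_flags_tbl: bit 0 is set when
+	 * IBV_SEND_SIGNALED is requested and bit 1 when IBV_SEND_SOLICITED is.
+	 */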
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+
+}
+
+static int post_send_rc_raw_packet(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+ uint32_t imm;
+ int idx;
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+
+	/* Sanity check - prevent posting an empty SR */
+ if (unlikely(!wr->num_sge))
+ return EINVAL;
+
+ if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) {
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED;
+ u.srcrb_flags = htonl((uint32_t)(qp->srcrb_flags_tbl[idx] | MLX4_WQE_CTRL_SOLICIT));
+ /* For raw eth, take the dmac from the payload */
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr;
+ imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2);
+ } else {
+ idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+
+ imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+ }
+
+ return set_common_segments(wr, qp, u.srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static int post_send_ud(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+ uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+ ? wr->imm_data : 0;
+
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_datagram_seg);
+ size += sizeof(struct mlx4_wqe_datagram_seg) / 16;
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static inline int post_send_connected(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind, int is_xrc) __attribute__((always_inline));
+static inline int post_send_connected(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind, int is_xrc)
+{
+ void *ctrl = wqe_add;
+ void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg);
+ uint32_t srcrb_flags;
+ uint32_t imm = 0;
+ int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED |
+ (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1);
+
+ if (is_xrc)
+ srcrb_flags = htonl((wr->qp_type.xrc.remote_srqn << 8) |
+ (qp->srcrb_flags_tbl[idx]));
+ else
+ srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+
+ switch (wr->opcode) {
+ case IBV_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, (struct ibv_exp_send_wr *)wr);
+ wqe += sizeof(struct mlx4_wqe_atomic_seg);
+ size += (sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IBV_WR_SEND_WITH_IMM:
+ imm = wr->imm_data;
+ break;
+
+ case IBV_WR_RDMA_WRITE_WITH_IMM:
+ imm = wr->imm_data;
+ if (!wr->num_sge)
+ *inl = 1;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+ size += sizeof(struct mlx4_wqe_raddr_seg) / 16;
+ break;
+
+ case IBV_WR_RDMA_READ:
+ *inl = 1;
+ /* fall through */
+ case IBV_WR_RDMA_WRITE:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+ size += sizeof(struct mlx4_wqe_raddr_seg) / 16;
+
+ break;
+
+ case IBV_WR_SEND:
+ break;
+
+ default:
+		/* No extra segments required */
+ break;
+ }
+
+ return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind);
+}
+
+static int post_send_rc_uc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 0);
+}
+
+static int post_send_xrc(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe_add, int *total_size,
+ int *inl, unsigned int ind)
+{
+ return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 1);
+}
+
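+/*
+ * Select the per-transport WQE builder for this QP so that the hot
+ * mlx4_post_send() path makes a single indirect call through
+ * qp->post_send_one instead of switching on the QP type per work request.
+ */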
+void mlx4_update_post_send_one(struct mlx4_qp *qp)
+{
+ switch (qp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_XRC:
+ qp->post_send_one = post_send_xrc;
+ break;
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ qp->post_send_one = post_send_rc_uc;
+ break;
+ case IBV_QPT_UD:
+ qp->post_send_one = post_send_ud;
+ break;
+
+ case IBV_QPT_RAW_PACKET:
+ qp->post_send_one = post_send_rc_raw_packet;
+ break;
+
+ default:
+ qp->post_send_one = post_send_other;
+ break;
}
}
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
- struct ibv_send_wr **bad_wr)
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ void *uninitialized_var(ctrl);
+ unsigned int ind;
+ int nreq;
+ int inl = 0;
+ int ret = 0;
+ int size = 0;
+
+ mlx4_lock(&qp->sq.lock);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		/* TODO: consider whether this first check can be dropped for QPs created via create_qp_exp */
+ if (!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW))
+ if (unlikely(wq_overflow(&qp->sq, nreq, qp))) {
+ ret = ENOMEM;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ ret = ENOMEM;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->opcode >= sizeof(mlx4_ib_opcode) / sizeof(mlx4_ib_opcode[0]))) {
+ ret = EINVAL;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ret = qp->post_send_one(wr, qp, ctrl, &size, &inl, ind);
+ if (unlikely(ret)) {
+ inl = 0;
+ errno = ret;
+ *bad_wr = wr;
+ goto out;
+ }
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ */
+ if (likely(wr->next))
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+#else
+			/* Make sure all owner bits are set to HW ownership */
+ set_owner_wqe(qp, ind, size,
+ ((ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0));
+#endif
+
+ ++ind;
+ }
+
+out:
+ ring_db(qp, ctrl, nreq, size, inl);
+
+ if (likely(nreq))
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+ (qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, ind - 1, size,
+ ((ind - 1) & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0));
+#endif
+ mlx4_unlock(&qp->sq.lock);
+
+ return ret;
+}
+
+int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr,
+ struct ibv_exp_send_wr **bad_wr)
{
- struct mlx4_context *ctx;
struct mlx4_qp *qp = to_mqp(ibqp);
void *wqe;
- struct mlx4_wqe_ctrl_seg *ctrl;
- int ind;
+ void *uninitialized_var(ctrl);
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+ uint32_t imm;
+ int idx;
+ unsigned int ind;
+ int uninitialized_var(owner_bit);
int nreq;
int inl = 0;
int ret = 0;
- int size;
- int i;
+ int size = 0;
+ uint32_t mlx4_wr_op;
+ uint64_t exp_send_flags;
- pthread_spin_lock(&qp->sq.lock);
+ mlx4_lock(&qp->sq.lock);
/* XXX check that state is OK to post send */
ind = qp->sq.head;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
- ret = -1;
+ exp_send_flags = wr->exp_send_flags;
+
+ if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW) &&
+ wq_overflow(&qp->sq, nreq, qp))) {
+ ret = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ ret = ENOMEM;
*bad_wr = wr;
goto out;
}
- if (wr->num_sge > qp->sq.max_gs) {
- ret = -1;
+ if (unlikely(wr->exp_opcode >= sizeof(mlx4_ib_opcode_exp) / sizeof(mlx4_ib_opcode_exp[0]))) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
- if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
- ret = -1;
+ if (((MLX4_IB_OPCODE_GET_CLASS(mlx4_ib_opcode_exp[wr->exp_opcode]) == MLX4_OPCODE_MANAGED) ||
+ (exp_send_flags & IBV_EXP_SEND_WITH_CALC)) &&
+ !(qp->create_flags & IBV_EXP_QP_CREATE_CROSS_CHANNEL)) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
+ mlx4_wr_op = MLX4_IB_OPCODE_GET_OP(mlx4_ib_opcode_exp[wr->exp_opcode]);
+
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+ owner_bit = ind & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0;
- ctrl->xrcrb_flags =
- (wr->send_flags & IBV_SEND_SIGNALED ?
- htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
- (wr->send_flags & IBV_SEND_SOLICITED ?
- htonl(MLX4_WQE_CTRL_SOLICIT) : 0) |
- qp->sq_signal_bits;
+ idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED |
+ (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) |
+ (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2);
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
- if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
- wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
- ctrl->imm = wr->imm_data;
- else
- ctrl->imm = 0;
+ imm = (MLX4_IB_OPCODE_GET_ATTR(mlx4_ib_opcode_exp[wr->exp_opcode]) & MLX4_OPCODE_WITH_IMM ?
+ wr->ex.imm_data : 0);
- wqe += sizeof *ctrl;
- size = sizeof *ctrl / 16;
+ wqe += sizeof(struct mlx4_wqe_ctrl_seg);
+ size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
- switch (ibqp->qp_type) {
+ switch (qp->qp_type) {
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_XRC:
- ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
- /* fall thru */
+ u.srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
case IBV_QPT_RC:
case IBV_QPT_UC:
- switch (wr->opcode) {
- case IBV_WR_ATOMIC_CMP_AND_SWP:
- case IBV_WR_ATOMIC_FETCH_AND_ADD:
- set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
- wr->wr.atomic.rkey);
+ switch (wr->exp_opcode) {
+ case IBV_EXP_WR_ATOMIC_CMP_AND_SWP:
+ case IBV_EXP_WR_ATOMIC_FETCH_AND_ADD:
+ case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD:
+ if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) {
+ if (!qp->is_masked_atomic) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_raddr_seg(wqe,
+ wr->ext_op.masked_atomics.remote_addr,
+ wr->ext_op.masked_atomics.rkey);
+ } else {
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ }
wqe += sizeof (struct mlx4_wqe_raddr_seg);
set_atomic_seg(wqe, wr);
@@ -264,184 +1239,259 @@
break;
- case IBV_WR_RDMA_READ:
+ case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP:
+ if (!qp->is_masked_atomic) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_raddr_seg(wqe,
+ wr->ext_op.masked_atomics.remote_addr,
+ wr->ext_op.masked_atomics.rkey);
+ wqe += sizeof(struct mlx4_wqe_raddr_seg);
+
+ set_masked_atomic_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_masked_atomic_seg);
+ size += (sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_masked_atomic_seg)) / 16;
+ break;
+
+ case IBV_EXP_WR_RDMA_READ:
inl = 1;
/* fall through */
- case IBV_WR_RDMA_WRITE:
- case IBV_WR_RDMA_WRITE_WITH_IMM:
- set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
- wr->wr.rdma.rkey);
+ case IBV_EXP_WR_RDMA_WRITE_WITH_IMM:
+ if (!wr->num_sge)
+ inl = 1;
+ /* fall through */
+ case IBV_EXP_WR_RDMA_WRITE:
+ if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) {
+
+ if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER ||
+ (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER ||
+ (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER ||
+ !mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].valid) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ mlx4_wr_op = MLX4_OPCODE_CALC_RDMA_WRITE_IMM |
+ mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].opcode;
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+
+ } else {
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ }
wqe += sizeof (struct mlx4_wqe_raddr_seg);
size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
break;
- default:
- /* No extra segments required for sends */
+ case IBV_EXP_WR_LOCAL_INV:
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof
+ (struct mlx4_wqe_local_inval_seg);
+ size += sizeof
+ (struct mlx4_wqe_local_inval_seg) / 16;
break;
- }
- break;
-
- case IBV_QPT_UD:
- set_datagram_seg(wqe, wr);
- wqe += sizeof (struct mlx4_wqe_datagram_seg);
- size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
- if (to_mah(wr->wr.ud.ah)->tagged) {
- ctrl->ins_vlan = 1 << 6;
- ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
- }
- break;
+ case IBV_EXP_WR_BIND_MW:
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof
+ (struct mlx4_wqe_bind_seg);
+ size += sizeof
+ (struct mlx4_wqe_bind_seg) / 16;
+ break;
- default:
- break;
- }
+ case IBV_EXP_WR_SEND:
+ if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) {
+
+ if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER ||
+ (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER ||
+ (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER ||
+ !mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].valid) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ mlx4_wr_op = MLX4_OPCODE_CALC_SEND |
+ mlx4_calc_ops_table
+ [wr->op.calc.data_size]
+ [wr->op.calc.calc_op]
+ [wr->op.calc.data_type].opcode;
+ }
- if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
- struct mlx4_wqe_inline_seg *seg;
- void *addr;
- int len, seg_len;
- int num_seg;
- int off, to_copy;
+ break;
- inl = 0;
+ case IBV_EXP_WR_CQE_WAIT:
+ {
+ struct mlx4_cq *wait_cq = to_mcq(wr->task.cqe_wait.cq);
+ uint32_t wait_index = 0;
- seg = wqe;
- wqe += sizeof *seg;
- off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
- num_seg = 0;
- seg_len = 0;
+ wait_index = wait_cq->wait_index +
+ wr->task.cqe_wait.cq_count;
+ wait_cq->wait_count = max(wait_cq->wait_count,
+ wr->task.cqe_wait.cq_count);
- for (i = 0; i < wr->num_sge; ++i) {
- addr = (void *) (uintptr_t) wr->sg_list[i].addr;
- len = wr->sg_list[i].length;
- inl += len;
+ if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) {
+ wait_cq->wait_index += wait_cq->wait_count;
+ wait_cq->wait_count = 0;
+ }
- if (inl > qp->max_inline_data) {
- inl = 0;
- ret = -1;
- *bad_wr = wr;
- goto out;
+ set_wait_en_seg(wqe, wait_cq->cqn, wait_index);
+ wqe += sizeof(struct mlx4_wqe_wait_en_seg);
+ size += sizeof(struct mlx4_wqe_wait_en_seg) / 16;
}
+ break;
- while (len >= MLX4_INLINE_ALIGN - off) {
- to_copy = MLX4_INLINE_ALIGN - off;
- memcpy(wqe, addr, to_copy);
- len -= to_copy;
- wqe += to_copy;
- addr += to_copy;
- seg_len += to_copy;
- wmb(); /* see comment below */
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
- seg_len = 0;
- seg = wqe;
- wqe += sizeof *seg;
- off = sizeof *seg;
- ++num_seg;
+ case IBV_EXP_WR_SEND_ENABLE:
+ case IBV_EXP_WR_RECV_ENABLE:
+ {
+ unsigned head_en_index;
+ struct mlx4_wq *wq;
+
+ /*
+ * Posting a work request to a QP that does not support
+ * SEND/RECV ENABLE degrades performance.
+ */
+ if (((wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) &&
+ !(to_mqp(wr->task.wqe_enable.qp)->create_flags &
+ IBV_EXP_QP_CREATE_MANAGED_SEND)) ||
+ ((wr->exp_opcode == IBV_EXP_WR_RECV_ENABLE) &&
+ !(to_mqp(wr->task.wqe_enable.qp)->create_flags &
+ IBV_EXP_QP_CREATE_MANAGED_RECV))) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wq = (wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) ?
+ &to_mqp(wr->task.wqe_enable.qp)->sq :
+ &to_mqp(wr->task.wqe_enable.qp)->rq;
+
+ /* If wqe_count is 0, release all WRs from the queue */
+ if (wr->task.wqe_enable.wqe_count) {
+ head_en_index = wq->head_en_index +
+ wr->task.wqe_enable.wqe_count;
+ wq->head_en_count = max(wq->head_en_count,
+ wr->task.wqe_enable.wqe_count);
+
+ if ((int)(wq->head - head_en_index) < 0) {
+ ret = -1;
+ *bad_wr = wr;
+ goto out;
+ }
+ } else {
+ head_en_index = wq->head;
+ wq->head_en_count = wq->head - wq->head_en_index;
+ }
+
+ if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) {
+ wq->head_en_index += wq->head_en_count;
+ wq->head_en_count = 0;
+ }
+
+ set_wait_en_seg(wqe,
+ wr->task.wqe_enable.qp->qp_num,
+ head_en_index);
+
+ wqe += sizeof(struct mlx4_wqe_wait_en_seg);
+ size += sizeof(struct mlx4_wqe_wait_en_seg) / 16;
}
+ break;
- memcpy(wqe, addr, len);
- wqe += len;
- seg_len += len;
- off += len;
- }
+ case IBV_EXP_WR_SEND_WITH_INV:
+ imm = htonl(wr->ex.invalidate_rkey);
+ break;
- if (seg_len) {
- ++num_seg;
- /*
- * Need a barrier here to make sure
- * all the data is visible before the
- * byte_count field is set. Otherwise
- * the HCA prefetcher could grab the
- * 64-byte chunk with this inline
- * segment and get a valid (!=
- * 0xffffffff) byte count but stale
- * data, and end up sending the wrong
- * data.
- */
- wmb();
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ default:
+ /* No extra segments required for sends */
+ break;
}
+ break;
- size += (inl + num_seg * sizeof * seg + 15) / 16;
- } else {
- struct mlx4_wqe_data_seg *seg = wqe;
+ case IBV_QPT_UD:
+ set_datagram_seg(wqe, (struct ibv_send_wr *)wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
- for (i = wr->num_sge - 1; i >= 0 ; --i)
- set_data_seg(seg + i, wr->sg_list + i);
+ case IBV_QPT_RAW_PACKET:
+ /* Sanity check - prevent posting an empty SR */
+ if (unlikely(!wr->num_sge)) {
+ ret = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+ if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) {
+ /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ u.srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT);
+ /* For raw eth, take the dmac from the payload */
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr;
+ imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2);
+ }
+ break;
- size += wr->num_sge * (sizeof *seg / 16);
+ default:
+ break;
}
- ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
- MLX4_WQE_CTRL_FENCE : 0) | size;
-
- /*
- * Make sure descriptor is fully written before
- * setting ownership bit (because HW can start
- * executing as soon as we do).
- */
- wmb();
-
- ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
- (ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
+ ret = set_data_seg(qp, wqe, &size, !!(exp_send_flags & IBV_EXP_SEND_INLINE),
+ wr->num_sge, wr->sg_list, &inl, owner_bit);
+ if (unlikely(ret)) {
+ inl = 0;
+ *bad_wr = wr;
+ goto out;
+ }
+ set_ctrl_seg(ctrl, (struct ibv_send_wr *)wr, qp, imm, u.srcrb_flags, owner_bit, size, mlx4_wr_op);
/*
* We can improve latency by not stamping the last
* send queue WQE until after ringing the doorbell, so
* only stamp here if there are still more WQEs to post.
*/
- if (wr->next)
+ if (likely(wr->next))
+#ifndef MLX4_WQE_FORMAT
stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
(qp->sq.wqe_cnt - 1));
-
+#else
+ set_owner_wqe(qp, ind, size, owner_bit);
+#endif
++ind;
}
out:
- ctx = to_mctx(ibqp->context);
-
- if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
- ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
- *(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
- /*
- * Make sure that descriptor is written to memory
- * before writing to BlueFlame page.
- */
- wmb();
-
- ++qp->sq.head;
-
- pthread_spin_lock(&ctx->bf_lock);
-
- mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
- align(size * 16, 64));
- wc_wmb();
-
- ctx->bf_offset ^= ctx->bf_buf_size;
-
- pthread_spin_unlock(&ctx->bf_lock);
- } else if (nreq) {
- qp->sq.head += nreq;
-
- /*
- * Make sure that descriptors are written before
- * doorbell record.
- */
- wmb();
-
- *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
- }
-
- if (nreq)
+ ring_db(qp, ctrl, nreq, size, inl);
+ if (likely(nreq))
+#ifndef MLX4_WQE_FORMAT
stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
(qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, ind - 1, size, owner_bit);
+#endif
- pthread_spin_unlock(&qp->sq.lock);
+ mlx4_unlock(&qp->sq.lock);
return ret;
}
+
+
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
{
@@ -449,24 +1499,25 @@
struct mlx4_wqe_data_seg *scat;
int ret = 0;
int nreq;
- int ind;
+ unsigned int ind;
int i;
+ struct mlx4_inlr_rbuff *rbuffs;
- pthread_spin_lock(&qp->rq.lock);
+ mlx4_lock(&qp->rq.lock);
/* XXX check that state is OK to post receive */
-
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
- ret = -1;
+ if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW) &&
+ wq_overflow(&qp->rq, nreq, qp))) {
+ ret = ENOMEM;
*bad_wr = wr;
goto out;
}
- if (wr->num_sge > qp->rq.max_gs) {
- ret = -1;
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ ret = EINVAL;
*bad_wr = wr;
goto out;
}
@@ -476,11 +1527,20 @@
for (i = 0; i < wr->num_sge; ++i)
__set_data_seg(scat + i, wr->sg_list + i);
- if (i < qp->rq.max_gs) {
+ if (likely(i < qp->rq.max_gs)) {
scat[i].byte_count = 0;
scat[i].lkey = htonl(MLX4_INVALID_LKEY);
scat[i].addr = 0;
}
+ if (qp->max_inlr_sg) {
+ rbuffs = qp->inlr_buff.buff[ind].sg_list;
+ qp->inlr_buff.buff[ind].list_len = wr->num_sge;
+ for (i = 0; i < wr->num_sge; ++i) {
+ rbuffs->rbuff = (void *)(unsigned long)(wr->sg_list[i].addr);
+ rbuffs->rlen = wr->sg_list[i].length;
+ rbuffs++;
+ }
+ }
qp->rq.wrid[ind] = wr->wr_id;
@@ -488,7 +1548,7 @@
}
out:
- if (nreq) {
+ if (likely(nreq)) {
qp->rq.head += nreq;
/*
@@ -500,7 +1560,7 @@
*qp->db = htonl(qp->rq.head & 0xffff);
}
- pthread_spin_unlock(&qp->rq.lock);
+ mlx4_unlock(&qp->rq.lock);
return ret;
}
@@ -533,6 +1593,7 @@
struct mlx4_qp *qp)
{
int size;
+ int atomic_size;
int max_sq_sge;
max_sq_sge = align(cap->max_inline_data +
@@ -553,6 +1614,7 @@
size += sizeof (struct mlx4_wqe_raddr_seg);
break;
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_XRC:
case IBV_QPT_RC:
size += sizeof (struct mlx4_wqe_raddr_seg);
@@ -560,12 +1622,14 @@
* An atomic op will require an atomic segment, a
* remote address segment and one scatter entry.
*/
- if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
- sizeof (struct mlx4_wqe_raddr_seg) +
- sizeof (struct mlx4_wqe_data_seg)))
- size = (sizeof (struct mlx4_wqe_atomic_seg) +
- sizeof (struct mlx4_wqe_raddr_seg) +
- sizeof (struct mlx4_wqe_data_seg));
+ atomic_size = (qp->is_masked_atomic ?
+ sizeof(struct mlx4_wqe_masked_atomic_seg) :
+ sizeof(struct mlx4_wqe_atomic_seg)) +
+ sizeof(struct mlx4_wqe_raddr_seg) +
+ sizeof(struct mlx4_wqe_data_seg);
+
+ if (size < atomic_size)
+ size = atomic_size;
break;
default:
@@ -583,56 +1647,39 @@
; /* nothing */
}
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
- enum ibv_qp_type type, struct mlx4_qp *qp)
+int mlx4_use_huge(struct ibv_context *context, const char *key)
{
- qp->rq.max_gs = cap->max_recv_sge;
+ char e[VERBS_MAX_ENV_VAL];
- qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
- if (!qp->sq.wrid)
- return -1;
+ if (!ibv_exp_cmd_getenv(context, key, e, sizeof(e)) && !strcmp(e, "y"))
+ return 1;
+ return 0;
+}
+
+void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp)
+{
if (qp->rq.wqe_cnt) {
- qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
- if (!qp->rq.wrid) {
- free(qp->sq.wrid);
- return -1;
+ free(qp->rq.wrid);
+ if (qp->max_inlr_sg) {
+ free(qp->inlr_buff.buff[0].sg_list);
+ free(qp->inlr_buff.buff);
}
}
-
- for (qp->rq.wqe_shift = 4;
- 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
- qp->rq.wqe_shift++)
- ; /* nothing */
-
- qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
- (qp->sq.wqe_cnt << qp->sq.wqe_shift);
- if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
- qp->rq.offset = 0;
- qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
- } else {
- qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
- qp->sq.offset = 0;
- }
-
- if (mlx4_alloc_buf(&qp->buf,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size),
- to_mdev(pd->context->device)->page_size)) {
+ if (qp->sq.wqe_cnt)
free(qp->sq.wrid);
- free(qp->rq.wrid);
- return -1;
- }
- memset(qp->buf.buf, 0, qp->buf_size);
-
- return 0;
+ if (qp->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(context), &qp->buf);
+ else
+ mlx4_free_buf(&qp->buf);
}
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
enum ibv_qp_type type)
{
int wqe_size;
- struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);
+ struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context);
wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
sizeof (struct mlx4_wqe_ctrl_seg);
@@ -641,9 +1688,10 @@
wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
break;
+ case IBV_QPT_XRC_SEND:
+ case IBV_QPT_XRC:
case IBV_QPT_UC:
case IBV_QPT_RC:
- case IBV_QPT_XRC:
wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
break;
@@ -704,3 +1752,812 @@
else
ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}
+
+int mlx4_post_task(struct ibv_context *context,
+ struct ibv_exp_task *task_list,
+ struct ibv_exp_task **bad_task)
+{
+ int rc = 0;
+ struct ibv_exp_task *cur_task = NULL;
+ struct ibv_exp_send_wr *bad_wr;
+ struct mlx4_context *mlx4_ctx = to_mctx(context);
+
+ if (!task_list)
+ return rc;
+
+ pthread_mutex_lock(&mlx4_ctx->task_mutex);
+
+ cur_task = task_list;
+ while (!rc && cur_task) {
+
+ switch (cur_task->task_type) {
+ case IBV_EXP_TASK_SEND:
+ rc = ibv_exp_post_send(cur_task->item.qp,
+ cur_task->item.send_wr,
+ &bad_wr);
+ break;
+
+ case IBV_EXP_TASK_RECV:
+ rc = ibv_post_recv(cur_task->item.qp,
+ cur_task->item.recv_wr,
+ NULL);
+ break;
+
+ default:
+ rc = -1;
+ }
+
+ if (rc && bad_task) {
+ *bad_task = cur_task;
+ break;
+ }
+
+ cur_task = cur_task->next;
+ }
+
+ pthread_mutex_unlock(&mlx4_ctx->task_mutex);
+
+ return rc;
+}
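+
+/*
+ * Illustrative usage sketch (not part of this patch): tasks are chained
+ * through ->next and posted in order under task_mutex.  The QP and WR
+ * variables below are placeholders, and ibv_exp_post_task() is assumed to
+ * be the experimental entry point that routes to mlx4_post_task():
+ *
+ *	struct ibv_exp_task t[2], *bad;
+ *	memset(t, 0, sizeof(t));
+ *	t[0].task_type = IBV_EXP_TASK_SEND;
+ *	t[0].item.qp = send_qp;
+ *	t[0].item.send_wr = &send_wr;
+ *	t[0].next = &t[1];
+ *	t[1].task_type = IBV_EXP_TASK_RECV;
+ *	t[1].item.qp = recv_qp;
+ *	t[1].item.recv_wr = &recv_wr;
+ *	ibv_exp_post_task(ctx, t, &bad);
+ */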
+
+/*
+ * family interface functions
+ */
+
+/*
+ * send_pending - a general post-send function that puts one message in
+ * the send queue without ringing the QP doorbell.
+ *
+ * The user may call this function several times to fill the send queue with
+ * several messages and then call mlx4_send_flush to ring the QP doorbell
+ * (an illustrative usage sketch follows the function body below).
+ *
+ * This function is used to implement the following QP burst family functions:
+ * - send_pending
+ * - send_pending_inline
+ * - send_pending_sg_list
+ * - send_burst
+ */
+static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags,
+ const int use_raw_eth, const int use_inl,
+ const int thread_safe, const int wqe_64,
+ const int use_sg_list, int num_sge,
+ struct ibv_sge *sg_list,
+ const int lb) __attribute__((always_inline));
+static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags,
+ const int use_raw_eth, const int use_inl,
+ const int thread_safe, const int wqe_64,
+ const int use_sg_list, int num_sge,
+ struct ibv_sge *sg_list,
+ const int lb)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ uint32_t tunnel_offload = 0;
+ unsigned int owner_bit = qp->sq.head & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0;
+ int size;
+ int idx;
+ int i;
+
+ if (thread_safe)
+ mlx4_lock(&qp->sq.lock);
+
+ if (wqe_64)
+ ctrl = get_send_wqe64(qp, qp->sq.head & (qp->sq.wqe_cnt - 1));
+ else
+ ctrl = get_send_wqe(qp, qp->sq.head & (qp->sq.wqe_cnt - 1));
+
+ dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) + sizeof(struct mlx4_wqe_ctrl_seg));
+
+ if (use_sg_list) {
+ for (i = num_sge - 1; i >= 0 ; --i)
+ set_ptr_data(dseg + i, sg_list + i, owner_bit);
+
+ size = (sizeof(struct mlx4_wqe_ctrl_seg) + (num_sge * sizeof(struct mlx4_wqe_data_seg)))/ 16;
+ } else {
+ if (use_inl) {
+ size = sizeof(struct mlx4_wqe_ctrl_seg) / 16;
+ set_data_inl_seg_fast(qp, (void *)(uintptr_t)addr, length, dseg, &size, owner_bit);
+ } else {
+ size = (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))/ 16;
+ dseg->byte_count = SET_BYTE_COUNT(length);
+ dseg->lkey = htonl(lkey);
+ dseg->addr = htonll(addr);
+ }
+ }
+
+ if (use_raw_eth) {
+ /* For raw eth, the SOLICIT flag is used
+ * to indicate that no icrc should be calculated */
+ idx = IBV_EXP_QP_BURST_SOLICITED |
+ (flags & (IBV_EXP_QP_BURST_SIGNALED |
+ IBV_EXP_QP_BURST_IP_CSUM |
+ IBV_EXP_QP_BURST_TUNNEL));
+ tunnel_offload = flags & IBV_EXP_QP_BURST_TUNNEL ? MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_IL4 : 0;
+ } else {
+ idx = (flags & (IBV_EXP_QP_BURST_SIGNALED |
+ IBV_EXP_QP_BURST_SOLICITED |
+ IBV_EXP_QP_BURST_IP_CSUM));
+ }
+
+ if (use_raw_eth && lb) {
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ } u;
+
+ u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ /* For raw eth, take the dmac from the payload */
+ if (use_sg_list)
+ addr = sg_list[0].addr;
+ u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)addr;
+ ctrl->srcrb_flags = u.srcrb_flags;
+ ctrl->imm = *(uint32_t *)((uintptr_t)(addr)+2);
+ } else {
+ ctrl->srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]);
+ ctrl->imm = 0;
+ }
+ ctrl->fence_size = (flags & IBV_EXP_QP_BURST_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = htonl(MLX4_OPCODE_SEND | tunnel_offload) | owner_bit;
+ qp->sq.head++;
+
+ if (!wqe_64)
+#ifndef MLX4_WQE_FORMAT
+ stamp_send_wqe(qp, (qp->sq.head + qp->sq_spare_wqes) &
+ (qp->sq.wqe_cnt - 1));
+#else
+ set_owner_wqe(qp, qp->sq.head, size, owner_bit);
+#endif
+ if (thread_safe)
+ mlx4_unlock(&qp->sq.lock);
+ else
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ return 0;
+}
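+
+/*
+ * Illustrative usage sketch (not part of this patch): the family pointer is
+ * obtained via mlx4_get_qp_burst_family() further below; addr/len/lkey are
+ * placeholders.  Several messages are queued and a single doorbell is rung:
+ *
+ *	fam->send_pending(qp, addr1, len1, lkey, IBV_EXP_QP_BURST_SIGNALED);
+ *	fam->send_pending(qp, addr2, len2, lkey, 0);
+ *	fam->send_flush(qp);
+ */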
+
+/* burst family - send_pending */
+static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr,
+ uint32_t length, uint32_t lkey,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET &&
+ mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, inl, safe, */
+ return send_pending(qp, addr, length, lkey, flags, raw_eth, 0, 1,
+ /* wqe_64, use_sg, num_sge, sg_list, lb */
+ wqe_64, 0, 0, NULL, lb);
+}
+
+static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags)
+{
+ return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 1);
+}
+
+static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags)
+{
+ return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, uint64_t addr, \
+ uint32_t length, uint32_t lkey, \
+ uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, uint64_t addr, \
+ uint32_t length, uint32_t lkey, \
+ uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(qp, addr, length, lkey, flags, eth, 0, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list */ \
+ 0, wqe64, 0, 0, NULL, \
+ /* lb */ \
+ lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_UNSAFE(1, 1, 1);
+
+/* burst family - send_pending_inline */
+static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr,
+ uint32_t length, uint32_t flags,
+ const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr,
+ uint32_t length, uint32_t flags,
+ const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, */
+ return send_pending(qp, (uintptr_t)addr, length, 0, flags, raw_eth,
+ /* inl, safe, wqe_64, use_sg, num_sge, sg_list, lb */
+ 1, 1, wqe_64, 0, 0, NULL, lb);
+}
+
+static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags)
+{
+ return mlx4_send_pending_inl_safe(qp, addr, length, flags, 1);
+}
+
+static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags)
+{
+ return mlx4_send_pending_inl_safe(qp, addr, length, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_inl_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_INL_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, void *addr, \
+ uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *qp, void *addr, \
+ uint32_t length, uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(qp, (uintptr_t)addr, length, 0, flags, eth, 1, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \
+ 0, wqe64, 0, 0, NULL, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 1);
+
+/* burst family - send_pending_sg_list */
+static inline int mlx4_send_pending_sg_list_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_pending_sg_list_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(ibqp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+
+ /* qp, addr, length, lkey, flags, raw_eth, inl, */
+ return send_pending(ibqp, 0, 0, 0, flags, raw_eth, 0,
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */
+ 1, wqe_64, 1, num, sg_list, lb);
+}
+static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 1);
+}
+
+static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 0);
+}
+
+#define MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_sg_list_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_PENDING_SG_LIST_UNSAFE(eth, wqe64, lb) \
+ static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ /* qp, addr, length, lkey, flags, eth, inl, */ \
+ return send_pending(ibqp, 0, 0, 0, flags, eth, 0, \
+ /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \
+ 0, wqe64, 1, num, sg_list, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 1);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 0);
+MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 1);
+
+static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) __attribute__((always_inline));
+/* burst family - send_burst */
+static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int raw_eth, const int thread_safe,
+ const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) __attribute__((always_inline));
+static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int raw_eth, const int thread_safe,
+ const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ int i;
+
+ if (unlikely(thread_safe))
+ mlx4_lock(&qp->sq.lock);
+
+ for (i = 0; i < num; i++, sg_list++)
+ /* qp, addr, length, lkey, */
+ send_pending(ibqp, sg_list->addr, sg_list->length, sg_list->lkey,
+ /* flags, raw_eth, inl, safe, wqe_64, use_sg, */
+ flags, raw_eth, 0, 0, wqe_64, 0,
+ /* num_sge, sg_list, lb */
+ 0, NULL, lb);
+
+ if (use_bf)
+ /* use send_flush_unsafe since lock is already taken if needed */
+ send_flush_unsafe(ibqp, _1thrd_evict, wqe_64);
+ else
+ *qp->sdb = qp->doorbell_qpn;
+
+ if (unlikely(thread_safe))
+ mlx4_unlock(&qp->sq.lock);
+
+ return 0;
+}
+
+static inline int mlx4_send_burst_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb) __attribute__((always_inline));
+static inline int mlx4_send_burst_safe(
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ uint32_t flags, const int lb)
+{
+ struct mlx4_qp *mqp = to_mqp(ibqp);
+ int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe_64 = mqp->sq.wqe_shift == 6;
+ int _1thrd_evict = mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB ||
+ mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ int use_bf = mqp->db_method != MLX4_QP_DB_METHOD_DB;
+
+ return send_msg_list(ibqp, sg_list, num, flags, raw_eth, 1, wqe_64, use_bf, _1thrd_evict, lb);
+}
+
+static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 1);
+}
+
+static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__;
+static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags)
+{
+ return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 0);
+}
+
+#define MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb) mlx4_send_burst_unsafe_##_1thrd_evict##eth##wqe64##lb
+#define MLX4_SEND_BURST_UNSAFE(_1thrd_evict, eth, wqe64, lb) \
+ static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 1, _1thrd_evict, \
+ lb); \
+ }
+/* _1thrd_evict, eth, wqe64, lb */
+MLX4_SEND_BURST_UNSAFE(0, 0, 0, 0);
+MLX4_SEND_BURST_UNSAFE(0, 0, 0, 1);
+MLX4_SEND_BURST_UNSAFE(0, 0, 1, 0);
+MLX4_SEND_BURST_UNSAFE(0, 0, 1, 1);
+MLX4_SEND_BURST_UNSAFE(0, 1, 0, 0);
+MLX4_SEND_BURST_UNSAFE(0, 1, 0, 1);
+MLX4_SEND_BURST_UNSAFE(0, 1, 1, 0);
+MLX4_SEND_BURST_UNSAFE(0, 1, 1, 1);
+MLX4_SEND_BURST_UNSAFE(1, 0, 0, 0);
+MLX4_SEND_BURST_UNSAFE(1, 0, 0, 1);
+MLX4_SEND_BURST_UNSAFE(1, 0, 1, 0);
+MLX4_SEND_BURST_UNSAFE(1, 0, 1, 1);
+MLX4_SEND_BURST_UNSAFE(1, 1, 0, 0);
+MLX4_SEND_BURST_UNSAFE(1, 1, 0, 1);
+MLX4_SEND_BURST_UNSAFE(1, 1, 1, 0);
+MLX4_SEND_BURST_UNSAFE(1, 1, 1, 1);
+
+#define MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb) mlx4_send_burst_unsafe_##eth##wqe64##lb
+#define MLX4_SEND_BURST_UNSAFE_DB(eth, wqe64, lb) \
+ static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num, uint32_t flags) \
+ { \
+ return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 0, 0, lb); \
+ }
+/* eth, wqe64, lb */
+MLX4_SEND_BURST_UNSAFE_DB(0, 0, 0);
+MLX4_SEND_BURST_UNSAFE_DB(0, 0, 1);
+MLX4_SEND_BURST_UNSAFE_DB(0, 1, 0);
+MLX4_SEND_BURST_UNSAFE_DB(0, 1, 1);
+MLX4_SEND_BURST_UNSAFE_DB(1, 0, 0);
+MLX4_SEND_BURST_UNSAFE_DB(1, 0, 1);
+MLX4_SEND_BURST_UNSAFE_DB(1, 1, 0);
+MLX4_SEND_BURST_UNSAFE_DB(1, 1, 1);
+
+/* burst family - send_flush */
+static int mlx4_send_flush_db(struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__;
+static int mlx4_send_flush_db(struct ibv_qp *ibqp)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
+ *qp->sdb = qp->doorbell_qpn;
+
+ return 0;
+}
+
+static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
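+ /*
+ * Fast path: exactly one WQE was queued since the last flush and it
+ * fits in the BlueFlame buffer, so copy it straight to the BlueFlame
+ * register; otherwise fall back to ringing the regular doorbell.
+ */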
+ if (qp->last_db_head + 1 == qp->sq.head) {
+ struct mlx4_wqe_ctrl_seg *ctrl = get_send_wqe(qp, qp->last_db_head & (qp->sq.wqe_cnt - 1));
+ int size = ctrl->fence_size & 0x3f;
+
+ /*
+ * There is no need to check that size > 1 since we get here only
+ * after using the send_pending function, which guarantees that size > 1
+ */
+ if (wqe64)
+ copy_wqe_to_bf(qp, ctrl, 64, qp->last_db_head,
+ 1, _1thrd_evict);
+ else if (size <= qp->bf_buf_size / 16)
+ copy_wqe_to_bf(qp, ctrl, align(size * 16, 64),
+ qp->last_db_head,
+ 1, _1thrd_evict);
+ else
+ *qp->sdb = qp->doorbell_qpn;
+ } else {
+ *qp->sdb = qp->doorbell_qpn;
+ }
+ qp->last_db_head = qp->sq.head;
+
+ return 0;
+}
+
+#define MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64) mlx4_send_flush_unsafe_##_1thrd_evict##wqe64
+#define MLX4_SEND_FLUSH_UNSAFE(_1thrd_evict, wqe64) \
+ static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \
+ struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; \
+ static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \
+ struct ibv_qp *ibqp) \
+ { \
+ return send_flush_unsafe(ibqp, _1thrd_evict, wqe64); \
+ }
+
+/* _1thrd_evict, wqe64 */
+MLX4_SEND_FLUSH_UNSAFE(0, 0);
+MLX4_SEND_FLUSH_UNSAFE(1, 0);
+MLX4_SEND_FLUSH_UNSAFE(0, 1);
+MLX4_SEND_FLUSH_UNSAFE(1, 1);
+
+/* burst family - recv_burst */
+static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ const int thread_safe, const int use_inline_recv, const int max_one_sge) __attribute__((always_inline));
+static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num,
+ const int thread_safe, const int use_inline_recv, const int max_one_sge)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ struct mlx4_inlr_rbuff *rbuffs;
+ unsigned int ind;
+ int i;
+
+ if (thread_safe)
+ mlx4_lock(&qp->rq.lock);
+
+ for (i = 0; i < num; ++i) {
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+ scat = get_recv_wqe(qp, ind);
+ __set_data_seg(scat, sg_list);
+
+ if (!max_one_sge) {
+ scat[1].byte_count = 0;
+ scat[1].lkey = htonl(MLX4_INVALID_LKEY);
+ scat[1].addr = 0;
+ }
+
+ if (use_inline_recv) {
+ rbuffs = qp->inlr_buff.buff[ind].sg_list;
+ qp->inlr_buff.buff[ind].list_len = 1;
+ rbuffs->rbuff = (void *)(unsigned long)(sg_list->addr);
+ rbuffs->rlen = sg_list->length;
+ rbuffs++;
+ }
+ sg_list++;
+ qp->rq.head++;
+ }
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db = htonl(qp->rq.head & 0xffff);
+
+ if (thread_safe)
+ mlx4_unlock(&qp->rq.lock);
+
+ return 0;
+}
+
+static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) __MLX4_ALGN_FUNC__;
+static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num)
+{
+ struct mlx4_qp *qp = to_mqp(ibqp);
+
+ return recv_burst(ibqp, sg_list, num, 1, qp->max_inlr_sg, qp->rq.max_gs == 1);
+}
+#define MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge) mlx4_recv_burst_unsafe_##inlr##_1sge
+#define MLX4_RECV_BURST_UNSAFE(inlr, _1sge) \
+ static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num) __MLX4_ALGN_FUNC__; \
+ static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \
+ struct ibv_qp *ibqp, struct ibv_sge *sg_list, \
+ uint32_t num) \
+ { \
+ return recv_burst(ibqp, sg_list, num, 0, inlr, _1sge); \
+ }
+/* inlr, _1sge */
+MLX4_RECV_BURST_UNSAFE(0, 0);
+MLX4_RECV_BURST_UNSAFE(1, 0);
+MLX4_RECV_BURST_UNSAFE(0, 1);
+MLX4_RECV_BURST_UNSAFE(1, 1);
+
+/*
+ * qp_burst family implementation for safe QP
+ */
+struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_lb = {
+ .send_burst = mlx4_send_burst_safe_lb,
+ .send_pending = mlx4_send_pending_safe_lb,
+ .send_pending_inline = mlx4_send_pending_inl_safe_lb,
+ .send_pending_sg_list = mlx4_send_pending_sg_list_safe_lb,
+ .recv_burst = mlx4_recv_burst_safe,
+ .send_flush = mlx4_send_flush_db
+};
+
+struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_no_lb = {
+ .send_burst = mlx4_send_burst_safe_no_lb,
+ .send_pending = mlx4_send_pending_safe_no_lb,
+ .send_pending_inline = mlx4_send_pending_inl_safe_no_lb,
+ .send_pending_sg_list = mlx4_send_pending_sg_list_safe_no_lb,
+ .recv_burst = mlx4_recv_burst_safe,
+ .send_flush = mlx4_send_flush_db
+};
+
+/*
+ * qp_burst family implementation table for unsafe QP
+ */
+#define MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \
+ (lb << 5 | _1thrd_evict << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge)
+
+#define MLX4_QP_BURST_UNSAFE_TBL_ENTRY(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \
+ [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)] = { \
+ .send_burst = MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb), \
+ .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \
+ .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \
+ .send_flush = MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64), \
+ }
+static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_tbl[1 << 6] = {
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 1),
+};
+
+#define MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge) \
+ (lb << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge)
+
+#define MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(lb, eth, wqe64, inlr, _1sge) \
+ [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)] = { \
+ .send_burst = MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb), \
+ .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \
+ .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \
+ .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \
+ .send_flush = mlx4_send_flush_db, \
+ }
+static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_db_tbl[1 << 5] = {
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 1),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 0),
+ MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 1),
+};
+
+struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp,
+ struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ enum ibv_exp_query_intf_status ret = IBV_EXP_INTF_STAT_OK;
+ struct ibv_exp_qp_burst_family *family = NULL;
+ uint32_t unsupported_f;
+
+ if ((qp->verbs_qp.qp.state < IBV_QPS_INIT) || (qp->verbs_qp.qp.state > IBV_QPS_RTS)) {
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ_STATE;
+ return NULL;
+ }
+
+ if (params->flags) {
+ fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for QP family\n", params->flags);
+ *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+ unsupported_f = params->family_flags & ~(IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK |
+ IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR);
+ if (unsupported_f) {
+ fprintf(stderr, PFX "Family flags(0x%x) are not supported for QP family\n", unsupported_f);
+ *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ switch (qp->qp_type) {
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ case IBV_QPT_RAW_PACKET:
+ if (qp->model_flags & MLX4_QP_MODEL_FLAG_THREAD_SAFE) {
+ int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK);
+
+ if (lb)
+ family = &mlx4_qp_burst_family_safe_lb;
+ else
+ family = &mlx4_qp_burst_family_safe_no_lb;
+ } else {
+ int eth = qp->qp_type == IBV_QPT_RAW_PACKET &&
+ qp->link_layer == IBV_LINK_LAYER_ETHERNET;
+ int wqe64 = qp->sq.wqe_shift == 6;
+ int inlr = qp->max_inlr_sg != 0;
+ int _1sge = qp->rq.max_gs == 1;
+ int _1thrd_evict = qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB ||
+ qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK);
+
+ if (qp->db_method == MLX4_QP_DB_METHOD_DB)
+ family = &mlx4_qp_burst_family_unsafe_db_tbl
+ [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)];
+ else
+ family = &mlx4_qp_burst_family_unsafe_tbl
+ [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)];
+ }
+ break;
+
+ default:
+ ret = IBV_EXP_INTF_STAT_INVAL_PARARM;
+ break;
+ }
+
+ *status = ret;
+
+ return family;
+}
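+
+/*
+ * Illustrative only (not part of this patch): callers are expected to reach
+ * mlx4_get_qp_burst_family() through the experimental interface-query verb.
+ * The field and enum names below (ibv_exp_query_intf, IBV_EXP_INTF_QP_BURST,
+ * params.obj) are assumed from the experimental verbs API:
+ *
+ *	struct ibv_exp_query_intf_params params = {0};
+ *	enum ibv_exp_query_intf_status status;
+ *	struct ibv_exp_qp_burst_family *fam;
+ *
+ *	params.intf = IBV_EXP_INTF_QP_BURST;
+ *	params.obj = qp;
+ *	fam = ibv_exp_query_intf(ctx, &params, &status);
+ */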
Index: contrib/ofed/libmlx4/src/srq.c
===================================================================
--- contrib/ofed/libmlx4/src/srq.c
+++ contrib/ofed/libmlx4/src/srq.c
@@ -42,6 +42,7 @@
#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
+#include "mlx4-abi.h"
static void *get_wqe(struct mlx4_srq *srq, int n)
{
@@ -52,38 +53,43 @@
{
struct mlx4_wqe_srq_next_seg *next;
- pthread_spin_lock(&srq->lock);
+ mlx4_spin_lock(&srq->lock);
next = get_wqe(srq, srq->tail);
next->next_wqe_index = htons(ind);
srq->tail = ind;
- pthread_spin_unlock(&srq->lock);
+ mlx4_spin_unlock(&srq->lock);
}
int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr)
{
- struct mlx4_srq *srq = to_msrq(ibsrq);
+ struct mlx4_srq *srq;
struct mlx4_wqe_srq_next_seg *next;
struct mlx4_wqe_data_seg *scat;
int err = 0;
int nreq;
int i;
- pthread_spin_lock(&srq->lock);
+ if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE)
+ ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq);
+ srq = to_msrq(ibsrq);
+ mlx4_spin_lock(&srq->lock);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (wr->num_sge > srq->max_gs) {
- err = -1;
+ errno = EINVAL;
+ err = errno;
*bad_wr = wr;
break;
}
if (srq->head == srq->tail) {
/* SRQ is full*/
- err = -1;
+ errno = ENOMEM;
+ err = errno;
*bad_wr = wr;
break;
}
@@ -119,7 +125,7 @@
*srq->db = htonl(srq->counter);
}
- pthread_spin_unlock(&srq->lock);
+ mlx4_spin_unlock(&srq->lock);
return err;
}
@@ -174,52 +180,153 @@
return 0;
}
-struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+ memset(xsrq_table, 0, sizeof *xsrq_table);
+ xsrq_table->num_xsrq = size;
+ xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS;
+ xsrq_table->mask = (1 << xsrq_table->shift) - 1;
- if (ctx->xrc_srq_table[tind].refcnt)
- return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask];
- else
- return NULL;
+ pthread_mutex_init(&xsrq_table->mutex, NULL);
}
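+
+/*
+ * The xsrq table is a two-level, lazily allocated lookup: the SRQ number
+ * selects one of 1 << MLX4_XSRQ_TABLE_BITS buckets (upper bits) and an
+ * offset within the bucket (the low 'shift' bits); buckets are allocated on
+ * first use and reference-counted by mlx4_store_xsrq()/mlx4_clear_xsrq().
+ */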
-int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn,
- struct mlx4_srq *srq)
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
- int ret = 0;
+ int index;
- pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ if (xsrq_table->xsrq_table[index].refcnt)
+ return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask];
+
+ return NULL;
+}
- if (!ctx->xrc_srq_table[tind].refcnt) {
- ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1,
- sizeof(struct mlx4_srq *));
- if (!ctx->xrc_srq_table[tind].table) {
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq)
+{
+ int index, ret = 0;
+
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
+ if (!xsrq_table->xsrq_table[index].refcnt) {
+ xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1,
+ sizeof(struct mlx4_srq *));
+ if (!xsrq_table->xsrq_table[index].table) {
ret = -1;
goto out;
}
}
- ++ctx->xrc_srq_table[tind].refcnt;
- ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq;
+ xsrq_table->xsrq_table[index].refcnt++;
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq;
out:
- pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+ pthread_mutex_unlock(&xsrq_table->mutex);
return ret;
}
-void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn)
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
{
- int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift;
+ int index;
- pthread_mutex_lock(&ctx->xrc_srq_table_mutex);
+ index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+ pthread_mutex_lock(&xsrq_table->mutex);
- if (!--ctx->xrc_srq_table[tind].refcnt)
- free(ctx->xrc_srq_table[tind].table);
+ if (--xsrq_table->xsrq_table[index].refcnt)
+ xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL;
else
- ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL;
+ free(xsrq_table->xsrq_table[index].table);
+
+ pthread_mutex_unlock(&xsrq_table->mutex);
+}
+
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ struct mlx4_create_xsrq cmd;
+ struct mlx4_create_srq_resp resp;
+ struct mlx4_srq *srq;
+ int ret;
- pthread_mutex_unlock(&ctx->xrc_srq_table_mutex);
+ /* Sanity check SRQ size before proceeding */
+ if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
+ return NULL;
+
+ srq = calloc(1, sizeof *srq);
+ if (!srq)
+ return NULL;
+
+ if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded))
+ goto err;
+
+ srq->max = align_queue_size(attr_ex->attr.max_wr + 1);
+ srq->max_gs = attr_ex->attr.max_sge;
+ srq->counter = 0;
+ srq->ext_srq = 1;
+
+ if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq))
+ goto err;
+
+ srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!srq->db)
+ goto err_free;
+
+ *srq->db = 0;
+
+ cmd.buf_addr = (uintptr_t) srq->buf.buf;
+ cmd.db_addr = (uintptr_t) srq->db;
+
+ ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, sizeof(srq->verbs_srq),
+ attr_ex,
+ &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof resp);
+ if (ret)
+ goto err_db;
+
+ ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
+ srq->verbs_srq.srq_num, srq);
+ if (ret)
+ goto err_destroy;
+
+ return &srq->verbs_srq.srq;
+
+err_destroy:
+ ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
+err_db:
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
+err_free:
+ free(srq->wrid);
+ mlx4_free_buf(&srq->buf);
+err:
+ free(srq);
+ return NULL;
}
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
+{
+ struct mlx4_context *mctx = to_mctx(srq->context);
+ struct mlx4_srq *msrq = to_msrq(srq);
+ struct mlx4_cq *mcq;
+ int ret;
+
+ mcq = to_mcq(msrq->verbs_srq.cq);
+ mlx4_cq_clean(mcq, 0, msrq);
+ mlx4_lock(&mcq->lock);
+ mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
+ mlx4_unlock(&mcq->lock);
+
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret) {
+ mlx4_lock(&mcq->lock);
+ mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
+ mlx4_unlock(&mcq->lock);
+ return ret;
+ }
+
+ mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
+ mlx4_free_buf(&msrq->buf);
+ free(msrq->wrid);
+ free(msrq);
+
+ return 0;
+}
Index: contrib/ofed/libmlx4/src/verbs.c
===================================================================
--- contrib/ofed/libmlx4/src/verbs.c
+++ contrib/ofed/libmlx4/src/verbs.c
@@ -40,38 +40,130 @@
#include <pthread.h>
#include <errno.h>
#include <netinet/in.h>
-
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+/* Added for reg_mr mmap munmap system calls */
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sched.h>
+#include <glob.h>
#include "mlx4.h"
#include "mlx4-abi.h"
+#include "mlx4_exp.h"
#include "wqe.h"
+#define SHARED_MR_PROC_DIR_NAME "/proc/driver/mlx4_ib/mrs"
+#define FPATH_MAX 128
+
+int __mlx4_query_device(uint64_t raw_fw_ver,
+ struct ibv_device_attr *attr)
+{
+ unsigned major, minor, sub_minor;
+
+ major = (raw_fw_ver >> 32) & 0xffff;
+ minor = (raw_fw_ver >> 16) & 0xffff;
+ sub_minor = raw_fw_ver & 0xffff;
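+ /* e.g. raw_fw_ver 0x00000002000b03e8 decodes as fw_ver "2.11.1000" */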
+
+ snprintf(attr->fw_ver, sizeof attr->fw_ver,
+ "%d.%d.%03d", major, minor, sub_minor);
+
+ return 0;
+}
+
int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
{
struct ibv_query_device cmd;
uint64_t raw_fw_ver;
- unsigned major, minor, sub_minor;
int ret;
- ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
+ read_init_vars(to_mctx(context));
+ ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd,
+ sizeof(cmd));
if (ret)
return ret;
- major = (raw_fw_ver >> 32) & 0xffff;
- minor = (raw_fw_ver >> 16) & 0xffff;
- sub_minor = raw_fw_ver & 0xffff;
+ return __mlx4_query_device(raw_fw_ver, attr);
+}
- snprintf(attr->fw_ver, sizeof attr->fw_ver,
- "%d.%d.%03d", major, minor, sub_minor);
+#define READL(ptr) (*((uint32_t *)(ptr)))
+
+static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
+{
+ unsigned int clockhi, clocklo, clockhi1;
+ int i;
+ struct mlx4_context *ctx = to_mctx(context);
+
+ if (ctx->hca_core_clock == NULL)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < 10; i++) {
+ clockhi = ntohl(READL(ctx->hca_core_clock));
+ clocklo = ntohl(READL(ctx->hca_core_clock + 4));
+ clockhi1 = ntohl(READL(ctx->hca_core_clock));
+ if (clockhi == clockhi1)
+ break;
+ }
+
+ if (clocklo == 0)
+ clockhi++;
+
+ *cycles = (uint64_t) clockhi << 32 | (uint64_t) clocklo;
return 0;
}
+int mlx4_query_values(struct ibv_context *context, int q_values,
+ struct ibv_exp_values *values)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ uint64_t cycles;
+ int err;
+ uint32_t comp_mask = values->comp_mask;
+
+ values->comp_mask = 0;
+
+ if (q_values & (IBV_EXP_VALUES_HW_CLOCK | IBV_EXP_VALUES_HW_CLOCK_NS)) {
+ err = mlx4_read_clock(context, &cycles);
+ if (!err) {
+ if (comp_mask & IBV_EXP_VALUES_HW_CLOCK) {
+ values->hwclock = cycles;
+ values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK;
+ }
+ if (q_values & IBV_EXP_VALUES_HW_CLOCK_NS) {
+ if (comp_mask & IBV_EXP_VALUES_HW_CLOCK_NS) {
+ values->hwclock_ns =
+ ((uint64_t)values->hwclock *
+ ctx->core_clk.mult)
+ >> ctx->core_clk.shift;
+ values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK_NS;
+ }
+ }
+ }
+ }
+ return 0;
+}
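
/*
 * Illustration only (editor's sketch, not part of the patch): the
 * cycles-to-nanoseconds conversion used in mlx4_query_values() above is a
 * fixed-point multiply with the mult/shift pair kept in mlx4_context
 * (core_clk.mult and core_clk.shift in this diff); the integer types used
 * here are assumptions of the sketch.
 */
static inline uint64_t cycles_to_ns_example(uint64_t cycles,
					    uint32_t mult, uint32_t shift)
{
	/* ns = cycles * mult / 2^shift */
	return (cycles * mult) >> shift;
}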
int mlx4_query_port(struct ibv_context *context, uint8_t port,
struct ibv_port_attr *attr)
{
struct ibv_query_port cmd;
+ int err;
+
+ read_init_vars(to_mctx(context));
+ err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
+ if (!err && port <= MLX4_PORTS_NUM && port > 0) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (!mctx->port_query_cache[port - 1].valid) {
+ mctx->port_query_cache[port - 1].link_layer =
+ attr->link_layer;
+ mctx->port_query_cache[port - 1].caps =
+ attr->port_cap_flags;
+ mctx->port_query_cache[port - 1].valid = 1;
+ }
+ }
- return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
+ return err;
}
struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
@@ -80,6 +172,7 @@
struct mlx4_alloc_pd_resp resp;
struct mlx4_pd *pd;
+ read_init_vars(to_mctx(context));
pd = malloc(sizeof *pd);
if (!pd)
return NULL;
@@ -107,50 +200,570 @@
return 0;
}
-struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
- enum ibv_access_flags access)
+
+static void mlx4_free_mr(struct mlx4_mr *mlx4_mr)
+{
+	/* The MR address was allocated in a special mode - free it accordingly */
+ if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR ||
+ mlx4_mr->shared_mr)
+ mlx4_free_buf(&(mlx4_mr->buf));
+
+ /* Finally we free the structure itself */
+ free(mlx4_mr);
+}
+
+
+static void *mlx4_get_contiguous_alloc_fallback(struct mlx4_buf *buf,
+ struct ibv_pd *pd, size_t length)
+{
+
+	/* As a fallback, allocate non-contiguous pages */
+ if (mlx4_alloc_buf(
+ buf,
+ align(length, to_mdev(pd->context->device)->page_size),
+ to_mdev(pd->context->device)->page_size))
+ return NULL;
+
+ return buf->buf;
+}
+
+
+/* We'll call mmap on mlx4_ib module to achieve this task */
+static void *mlx4_get_contiguous_alloc(struct mlx4_buf *mlx4_buf,
+ struct ibv_pd *pd,
+ size_t length,
+ void *contig_addr)
+{
+ size_t alloc_length;
+ int page_size;
+ int mr_no_allocator = 0;
+ int mr_force_contig_pages = 0;
+ enum mlx4_alloc_type alloc_type;
+
+ mlx4_get_alloc_type(pd->context, MLX4_MR_PREFIX, &alloc_type,
+ MLX4_ALLOC_TYPE_ALL);
+
+ if (alloc_type == MLX4_ALLOC_TYPE_CONTIG)
+ mr_force_contig_pages = 1;
+ else if (alloc_type == MLX4_ALLOC_TYPE_ANON)
+ mr_no_allocator = 1;
+
+	/* For benchmarking purposes we provide an option, controlled by an
+	   environment variable, to turn off the contiguous allocator.
+ */
+ if (mr_no_allocator)
+ return mlx4_get_contiguous_alloc_fallback(mlx4_buf, pd,
+ length);
+
+ page_size = to_mdev(pd->context->device)->page_size;
+ alloc_length = (contig_addr ? length : align(length, page_size));
+ if (!(mlx4_alloc_buf_contig(to_mctx(pd->context),
+ mlx4_buf, alloc_length,
+ page_size, MLX4_MR_PREFIX, contig_addr)))
+ return contig_addr ? contig_addr : mlx4_buf->buf;
+
+ if (mr_force_contig_pages || contig_addr)
+ return NULL;
+
+ return mlx4_get_contiguous_alloc_fallback(mlx4_buf,
+ pd, length);
+
+}
+
+static int mlx4_get_shared_mr_name(char *in_pattern, char *file_name)
+{
+ glob_t results;
+ int ret;
+
+ ret = glob(in_pattern, 0, NULL, &results);
+
+ if (ret) {
+ if (mlx4_trace)
+ /* might be some legacy kernel with old mode */
+ fprintf(stderr, "mlx4_get_shared_mr_name: glob failed for %s, ret=%d, errno=%d\n",
+ in_pattern, ret, errno);
+ return ret;
+ }
+
+ if (results.gl_pathc > 1) {
+ int i;
+ int duplicate_name = 1;
+
+	/* We encountered an issue where glob returned the same name twice; we suspect it to be
+	 * an issue with glob/procfs. When there is more than one entry, check whether all entries
+	 * are the same; in that case the API succeeded and we use the first entry's name.
+ */
+ for (i = 1; i < results.gl_pathc; i++) {
+ if (strcmp(results.gl_pathv[0], results.gl_pathv[i])) {
+ duplicate_name = 0;
+ break;
+ }
+ }
+
+ if (!duplicate_name) {
+ fprintf(stderr, "mlx4_get_shared_mr_name failed for %s, unexpected %lu paths were found\n",
+ in_pattern, (unsigned long)(results.gl_pathc));
+ for (i = 0; i < results.gl_pathc; i++)
+ fprintf(stderr, "mlx4_get_shared_mr_name: path#%d=%s\n", i,
+ results.gl_pathv[i]);
+ globfree(&results);
+ return -EINVAL;
+ }
+ }
+
+ strncpy(file_name, results.gl_pathv[0], FPATH_MAX);
+ file_name[FPATH_MAX - 1] = '\0';
+ globfree(&results);
+ return 0;
+}
+
+struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in)
+{
+ struct ibv_context *context;
+ size_t total_size;
+ int page_size;
+ char shared_mr_file_name[FPATH_MAX];
+ char shared_mr_pattern[FPATH_MAX];
+ int fd;
+ struct stat buffer;
+ int status;
+ struct ibv_mr *ibv_mr;
+ uint64_t shared_flags;
+ struct mlx4_mr *mlx4_mr = NULL;
+ void *addr = in->addr;
+ uint64_t access = in->exp_access;
+ struct ibv_exp_reg_mr_in rmr_in;
+ int flags;
+ int ret;
+ int is_writeable_mr = !!(access & (IBV_EXP_ACCESS_REMOTE_WRITE |
+ IBV_EXP_ACCESS_LOCAL_WRITE | IBV_EXP_ACCESS_REMOTE_ATOMIC));
+
+ context = in->pd->context;
+ page_size = to_mdev(context->device)->page_size;
+ sprintf(shared_mr_pattern, "%s/%X.*",
+ SHARED_MR_PROC_DIR_NAME, in->mr_handle);
+
+ ret = mlx4_get_shared_mr_name(shared_mr_pattern, shared_mr_file_name);
+ if (ret)
+		/* For compatibility, retry with the legacy name */
+ sprintf(shared_mr_file_name, "%s/%X",
+ SHARED_MR_PROC_DIR_NAME, in->mr_handle);
+
+ flags = is_writeable_mr ? O_RDWR : O_RDONLY;
+ fd = open(shared_mr_file_name, flags);
+ if (fd < 0) {
+ int counter = 10;
+ /* retrying for 1 second before reporting an error */
+ while (fd < 0 && counter > 0) {
+ usleep(100000);
+ counter--;
+ fd = open(shared_mr_file_name, flags);
+ }
+
+ if (fd < 0) {
+ fprintf(stderr, "mlx4_reg_shared_mr failed open %s errno=%d\n",
+ shared_mr_file_name, errno);
+ return NULL;
+ }
+ }
+
+ status = fstat(fd, &buffer);
+ if (status) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr lstat has failed , errno=%d\n",
+ errno);
+ goto error;
+ }
+
+ total_size = align(buffer.st_size, page_size);
+
+	/* Set protection based on access flags; the input address may be NULL
+	   or an address recommended by the application.
+ */
+ addr = mmap(addr , total_size,
+ is_writeable_mr ? (PROT_WRITE | PROT_READ) :
+ PROT_READ, MAP_SHARED,
+ fd,
+ 0);
+
+	/* On failure, MAP_FAILED (that is, (void *) -1) is returned */
+ if (addr == MAP_FAILED) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr mmap has failed , errno=%d\n",
+ errno);
+ goto error;
+ }
+
+ if (ibv_dontfork_range(addr, total_size)) {
+ fprintf(stderr,
+ "mlx4_reg_shared_mr dontfork has failed , errno=%d\n",
+ errno);
+ goto err_unmap;
+ }
+
+ if (access & IBV_EXP_ACCESS_NO_RDMA) {
+ mlx4_mr = calloc(1, sizeof *mlx4_mr);
+ if (!mlx4_mr)
+ goto err_dofork;
+
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_NO_RDMA;
+ ibv_mr = &(mlx4_mr->ibv_mr);
+ ibv_mr->context = in->pd->context;
+
+ } else {
+		/* Make sure that shared access flags are off before
+		   calling reg_mr, otherwise the new MR will be shared as well.
+ */
+ shared_flags = IBV_EXP_ACCESS_SHARED_MR_USER_READ |
+ IBV_EXP_ACCESS_SHARED_MR_USER_WRITE |
+ IBV_EXP_ACCESS_SHARED_MR_GROUP_READ |
+ IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE |
+ IBV_EXP_ACCESS_SHARED_MR_OTHER_READ |
+ IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE;
+
+ access &= ~shared_flags;
+ rmr_in.pd = in->pd;
+ rmr_in.addr = addr;
+ rmr_in.length = total_size;
+ rmr_in.exp_access = access;
+ rmr_in.comp_mask = 0;
+
+ ibv_mr = mlx4_exp_reg_mr(&rmr_in);
+ if (!ibv_mr)
+ goto err_dofork;
+ }
+
+	/* The file can be closed now - it is no longer required */
+ close(fd);
+
+ ibv_mr->length = total_size;
+ ibv_mr->addr = addr;
+ mlx4_mr = to_mmr(ibv_mr);
+	/* We mark this MR as a shared one so it is handled correctly via dereg_mr */
+ mlx4_mr->shared_mr = 1;
+	/* We also hook addr & length internally for further
+	   use via dereg_mr.
+ */
+ mlx4_mr->buf.buf = addr;
+ mlx4_mr->buf.length = total_size;
+ return ibv_mr;
+
+err_dofork:
+ ibv_dofork_range(addr, total_size);
+err_unmap:
+ munmap(addr, total_size);
+error:
+ close(fd);
+ return NULL;
+}
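
/*
 * Illustration only (editor's sketch, not part of the patch): a caller
 * attaching to an existing shared MR would fill struct
 * ibv_exp_reg_shared_mr_in with the fields consumed above.  The mr_handle is
 * assumed to be obtained out of band from the process that created the
 * shared MR, and the access flag shown is just an example.
 */
static struct ibv_mr *attach_shared_mr_example(struct ibv_pd *pd,
					       uint32_t mr_handle)
{
	struct ibv_exp_reg_shared_mr_in in;

	memset(&in, 0, sizeof(in));
	in.pd = pd;
	in.addr = NULL;			/* let mmap() pick the mapping address */
	in.mr_handle = mr_handle;
	in.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE;

	return mlx4_reg_shared_mr(&in);
}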
+
+int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out)
+{
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+
+ out->need_dofork = (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR ||
+ mlx4_mr->shared_mr) ? 0 : 1;
+
+ return mlx4_dereg_mr(mr);
+}
+
+int mlx4_exp_rereg_mr(struct ibv_mr *mr,
+ int flags,
+ struct ibv_pd *pd, void *addr,
+ size_t length, uint64_t access,
+ struct ibv_exp_rereg_mr_attr *attr,
+ struct ibv_exp_rereg_out *out)
+{
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+ struct mlx4_buf buf;
+ struct ibv_exp_rereg_mr cmd;
+ struct ibv_exp_rereg_mr_resp resp;
+ int internal_alloc = 0;
+ int ret;
+
+ if (flags & (~IBV_EXP_REREG_MR_FLAGS_SUPPORTED | IBV_EXP_REREG_MR_KEEP_VALID))
+ return -EINVAL;
+
+ /* Currently, we don't support any features in comp_mask */
+ if (attr->comp_mask)
+ return -EINVAL;
+
+	/* Here we check whether contiguous pages are required and
+ should be allocated internally.
+ */
+
+ memset(&buf, 0, sizeof(buf));
+ if ((flags & IBV_EXP_REREG_MR_CHANGE_ACCESS) &&
+ !addr && (access & IBV_EXP_ACCESS_ALLOCATE_MR)) {
+ struct ibv_pd *curr_pd = flags & IBV_EXP_REREG_MR_CHANGE_PD ? pd : mr->pd;
+ addr = mlx4_get_contiguous_alloc(&buf, curr_pd, length, NULL);
+ if (!addr)
+ return -ENOMEM;
+
+ internal_alloc = 1;
+ }
+
+ ret = ibv_exp_cmd_rereg_mr(mr, flags, addr, length,
+ (uintptr_t) addr,
+ access, pd, attr,
+ &cmd, sizeof(cmd), 0,
+ &resp, sizeof(resp), 0);
+
+ if (ret) {
+ if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)
+ mlx4_free_buf(&buf);
+ return ret;
+ } else {
+ if (((mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR) ||
+ mlx4_mr->shared_mr) &&
+ (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)) {
+ mlx4_mr->shared_mr = 0;
+ mlx4_free_buf(&(mlx4_mr->buf));
+ /* The memory was just freed, mark it as NULL */
+ mlx4_mr->ibv_mr.addr = NULL;
+ mlx4_mr->allocation_flags &= ~IBV_EXP_ACCESS_ALLOCATE_MR;
+ out->need_dofork = 0;
+ }
+ if (internal_alloc) {
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR;
+			/* The address is returned to libibverbs through a
+			 * pointer-to-pointer mechanism.
+ */
+ mlx4_mr->ibv_mr.addr = addr;
+ mlx4_mr->ibv_mr.length = length;
+ memcpy(&mlx4_mr->buf, &buf, sizeof(mlx4_mr->buf));
+ }
+ }
+
+ return ret;
+}
+
+
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr)
+{
+ struct ibv_open_xrcd cmd;
+ struct ibv_open_xrcd_resp resp;
+ struct verbs_xrcd *xrcd;
+ int ret;
+
+ xrcd = calloc(1, sizeof *xrcd);
+ if (!xrcd)
+ return NULL;
+
+ ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
+ if (ret)
+ goto err;
+
+ return &xrcd->xrcd;
+
+err:
+ free(xrcd);
+ return NULL;
+}
+
+int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
{
- struct ibv_mr *mr;
+ struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
+ int ret;
+
+ ret = ibv_cmd_close_xrcd(xrcd);
+ if (!ret)
+ free(xrcd);
+
+ return ret;
+}
+
+struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in)
+{
+
+ struct mlx4_mr *mlx4_mr;
struct ibv_reg_mr cmd;
int ret;
+ int cmd_access;
+ int is_contig;
+
+ if ((in->comp_mask > IBV_EXP_REG_MR_RESERVED - 1) ||
+ (in->exp_access > IBV_EXP_ACCESS_RESERVED - 1)) {
+ errno = EINVAL;
+ return NULL;
+ }
- mr = malloc(sizeof *mr);
- if (!mr)
+ mlx4_mr = calloc(1, sizeof *mlx4_mr);
+ if (!mlx4_mr)
return NULL;
+ VALGRIND_MAKE_MEM_DEFINED(&in->create_flags, sizeof(in->create_flags));
+ is_contig = ((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) && !in->addr) ||
+ ((in->comp_mask & IBV_EXP_REG_MR_CREATE_FLAGS) &&
+ (in->create_flags & IBV_EXP_REG_MR_CREATE_CONTIG));
+	/* Here we check whether contiguous pages are required and
+ should be allocated internally.
+ */
+ if (is_contig) {
+ in->addr = mlx4_get_contiguous_alloc(&mlx4_mr->buf, in->pd,
+ in->length, in->addr);
+ if (!in->addr) {
+ free(mlx4_mr);
+ return NULL;
+ }
+
+ mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR;
+		/* Hook the addr onto the returned pointer for
+		   further use by the application.
+ */
+ mlx4_mr->ibv_mr.addr = in->addr;
+ }
+
+ cmd_access = (in->exp_access & (IBV_EXP_START_FLAG - 1)) |
+ (in->exp_access & (IBV_EXP_ACCESS_RESERVED - 1)) >> IBV_EXP_START_FLAG_LOC;
#ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS
{
struct ibv_reg_mr_resp resp;
- ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
- access, mr, &cmd, sizeof cmd,
- &resp, sizeof resp);
+ ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length,
+ (uintptr_t) in->addr, cmd_access,
+ &(mlx4_mr->ibv_mr),
+ &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
}
#else
- ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr,
- &cmd, sizeof cmd);
+ ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length,
+ (uintptr_t) in->addr, cmd_access,
+ &(mlx4_mr->ibv_mr),
+ &cmd, sizeof(cmd));
#endif
if (ret) {
- free(mr);
+ mlx4_free_mr(mlx4_mr);
return NULL;
}
- return mr;
+ return &(mlx4_mr->ibv_mr);
+}
+
+struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
+ size_t length, int access)
+{
+ struct ibv_exp_reg_mr_in in;
+
+ in.pd = pd;
+ in.addr = addr;
+ in.length = length;
+ in.exp_access = access;
+ in.comp_mask = 0;
+
+ return mlx4_exp_reg_mr(&in);
}
int mlx4_dereg_mr(struct ibv_mr *mr)
{
int ret;
+ struct mlx4_mr *mlx4_mr = to_mmr(mr);
+
+ if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_NO_RDMA)
+ goto free_mr;
ret = ibv_cmd_dereg_mr(mr);
if (ret)
return ret;
+free_mr:
+ mlx4_free_mr(mlx4_mr);
+ return 0;
+}
+
+struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
+{
+ struct verbs_mw *vmw;
+ struct ibv_alloc_mw cmd;
+ struct ibv_alloc_mw_resp resp;
+ int ret;
+
+ vmw = malloc(sizeof(*vmw));
+ if (!vmw)
+ return NULL;
+ memset(vmw, 0, sizeof(*vmw));
+
+ ret = ibv_cmd_alloc_mw(pd, type, vmw, &cmd, sizeof(cmd),
+ &resp, sizeof(resp));
+
+ if (ret) {
+ free(vmw);
+ return NULL;
+ }
+ vmw->type = type;
+
+ return &vmw->mw;
+}
+
+int mlx4_dealloc_mw(struct ibv_mw *mw)
+{
+ int ret;
+ struct ibv_dealloc_mw cmd;
+ struct verbs_mw *vmw = (struct verbs_mw *)mw;
+
+ ret = ibv_cmd_dealloc_mw(vmw, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ free(vmw);
+ return 0;
+}
+
+int __mlx4_bind_mw(struct ibv_exp_mw_bind *mw_bind)
+{
+ int ret;
+ struct ibv_exp_send_wr *bad_wr = NULL;
+ struct ibv_exp_send_wr wr = { };
+
+ wr.exp_opcode = IBV_EXP_WR_BIND_MW;
+ wr.next = NULL;
+
+ wr.wr_id = mw_bind->wr_id;
+ wr.exp_send_flags = mw_bind->exp_send_flags;
+
+ wr.bind_mw.mw = mw_bind->mw;
+ wr.bind_mw.rkey = ibv_inc_rkey(mw_bind->mw->rkey);
+ wr.bind_mw.bind_info = mw_bind->bind_info;
+
+ ret = mlx4_exp_post_send(mw_bind->qp, &wr, &bad_wr);
+
+ if (ret)
+ return ret;
+
+ /* updating the mw with the latest rkey. */
+ mw_bind->mw->rkey = wr.bind_mw.rkey;
- free(mr);
return 0;
}
-static int align_queue_size(int req)
+int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind)
+{
+ struct ibv_exp_mw_bind exp_mw_bind;
+
+ memset(&exp_mw_bind, 0, sizeof(exp_mw_bind));
+ exp_mw_bind.qp = qp;
+ exp_mw_bind.exp_send_flags = mw_bind->send_flags;
+ exp_mw_bind.wr_id = mw_bind->wr_id;
+ exp_mw_bind.bind_info.addr = (uint64_t)(uintptr_t)mw_bind->addr;
+ exp_mw_bind.bind_info.length = mw_bind->length;
+ exp_mw_bind.bind_info.mr = mw_bind->mr;
+ exp_mw_bind.bind_info.exp_mw_access_flags = mw_bind->mw_access_flags;
+ exp_mw_bind.comp_mask = 0;
+
+ return __mlx4_bind_mw(&exp_mw_bind);
+
+}
+
+int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind)
+{
+ if (mw_bind->comp_mask > IBV_EXP_BIND_MW_RESERVED - 1)
+ return EINVAL;
+ return __mlx4_bind_mw(mw_bind);
+}
+
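/*
 * Illustration only (editor's sketch, not part of the patch): the bind
 * helpers above derive the new rkey with ibv_inc_rkey() before posting the
 * IBV_EXP_WR_BIND_MW work request and write it back to mw->rkey only when
 * the post succeeds, so a failed bind leaves the previous rkey in place.
 */
static int rebind_mw_example(struct ibv_qp *qp, struct ibv_mw *mw,
			     struct ibv_mw_bind *bind)
{
	int ret = mlx4_bind_mw(qp, mw, bind);

	if (ret)
		return ret;	/* mw->rkey is left unchanged on failure */

	/* on success mw->rkey now holds the incremented key to hand to peers */
	return 0;
}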
+int align_queue_size(int req)
{
int nent;
@@ -160,36 +773,52 @@
return nent;
}
-struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
- struct ibv_comp_channel *channel,
- int comp_vector)
+static struct ibv_cq *create_cq(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr)
{
- struct mlx4_create_cq cmd;
- struct mlx4_create_cq_resp resp;
- struct mlx4_cq *cq;
- int ret;
- struct mlx4_context *mctx = to_mctx(context);
+ struct mlx4_create_cq cmd;
+ struct mlx4_exp_create_cq cmd_e;
+ struct mlx4_create_cq_resp resp;
+ struct mlx4_cq *cq;
+ int ret;
+ struct mlx4_context *mctx = to_mctx(context);
+ int thread_safe;
/* Sanity check CQ size before proceeding */
if (cqe > 0x3fffff)
return NULL;
- cq = malloc(sizeof *cq);
+ cq = calloc(1, sizeof(*cq));
if (!cq)
return NULL;
cq->cons_index = 0;
+ cq->wait_index = 0;
+ cq->wait_count = 0;
- if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
+ thread_safe = !mlx4_single_threaded;
+ if (attr && (attr->comp_mask & IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN)) {
+ if (!attr->res_domain) {
+ errno = EINVAL;
+ goto err;
+ }
+ thread_safe = (to_mres_domain(attr->res_domain)->attr.thread_model == IBV_EXP_THREAD_SAFE);
+ }
+
+ if (mlx4_lock_init(&cq->lock, thread_safe, mlx4_get_locktype()))
goto err;
+ cq->model_flags = thread_safe ? MLX4_CQ_MODEL_FLAG_THREAD_SAFE : 0;
+
cqe = align_queue_size(cqe + 1);
- if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size))
+ if (mlx4_alloc_cq_buf(to_mctx(context), &cq->buf, cqe, mctx->cqe_size))
goto err;
cq->cqe_size = mctx->cqe_size;
-
cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
if (!cq->set_ci_db)
goto err_buf;
@@ -199,16 +828,41 @@
cq->arm_sn = 1;
*cq->set_ci_db = 0;
- cmd.buf_addr = (uintptr_t) cq->buf.buf;
- cmd.db_addr = (uintptr_t) cq->set_ci_db;
-
- ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector,
- &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
+ if (NULL != attr) {
+ cmd_e.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd_e.db_addr = (uintptr_t) cq->set_ci_db;
+ } else {
+ cmd.buf_addr = (uintptr_t) cq->buf.buf;
+ cmd.db_addr = (uintptr_t) cq->set_ci_db;
+ }
+ if (NULL != attr) {
+ ret = ibv_exp_cmd_create_cq(context, cqe - 1, channel,
+ comp_vector, &cq->ibv_cq,
+ &cmd_e.ibv_cmd,
+ sizeof(cmd_e.ibv_cmd),
+ sizeof(cmd_e) - sizeof(cmd_e.ibv_cmd),
+ &resp.ibv_resp,
+ sizeof(resp.ibv_resp),
+ sizeof(resp) - sizeof(resp.ibv_resp),
+ attr);
+ } else {
+ ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector,
+ &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd,
+ &resp.ibv_resp, sizeof(resp));
+ }
if (ret)
goto err_db;
cq->cqn = resp.cqn;
+ cq->stall_next_poll = 0;
+ cq->stall_enable = mctx->stall_enable;
+ if (NULL != attr && attr->comp_mask) {
+ if (cmd_e.ibv_cmd.comp_mask & IBV_EXP_CREATE_CQ_CAP_FLAGS) {
+ cq->creation_flags = attr->flags;
+ }
+ }
+
+ cq->pattern = MLX4_CQ_PATTERN;
return &cq->ibv_cq;
@@ -216,14 +870,41 @@
mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);
err_buf:
- mlx4_free_buf(&cq->buf);
-
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(context), &cq->buf);
+ else
+ mlx4_free_buf(&cq->buf);
err:
free(cq);
return NULL;
}
+struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector)
+{
+ read_init_vars(to_mctx(context));
+ return create_cq(context, cqe, channel, comp_vector, NULL);
+}
+
+struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context,
+ int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector,
+ struct ibv_exp_cq_init_attr *attr)
+{
+ return create_cq(context, cqe, channel, comp_vector, attr);
+}
+
+int mlx4_modify_cq(struct ibv_cq *cq,
+ struct ibv_exp_cq_attr *attr,
+ int attr_mask)
+{
+ struct ibv_exp_modify_cq cmd;
+ return ibv_exp_cmd_modify_cq(cq, attr, attr_mask, &cmd, sizeof(cmd));
+}
+
int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
{
struct mlx4_cq *cq = to_mcq(ibcq);
@@ -235,7 +916,7 @@
if (cqe > 0x3fffff)
return EINVAL;
- pthread_spin_lock(&cq->lock);
+ mlx4_lock(&cq->lock);
cqe = align_queue_size(cqe + 1);
if (cqe == ibcq->cqe + 1) {
@@ -250,7 +931,7 @@
goto out;
}
- ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe,
+ ret = mlx4_alloc_cq_buf(to_mctx(ibcq->context), &buf, cqe,
cq->cqe_size);
if (ret)
goto out;
@@ -268,17 +949,24 @@
ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
#endif
if (ret) {
- mlx4_free_buf(&buf);
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(ibcq->context), &buf);
+ else
+ mlx4_free_buf(&buf);
goto out;
}
mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
- mlx4_free_buf(&cq->buf);
- cq->buf = buf;
+ if (cq->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(ibcq->context), &cq->buf);
+ else
+ mlx4_free_buf(&cq->buf);
+ cq->buf = buf;
+ mlx4_update_cons_index(cq);
out:
- pthread_spin_unlock(&cq->lock);
+ mlx4_unlock(&cq->lock);
return ret;
}
@@ -291,14 +979,32 @@
return ret;
mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
- mlx4_free_buf(&to_mcq(cq)->buf);
+ if (to_mcq(cq)->buf.hmem != NULL)
+ mlx4_free_buf_huge(to_mctx(cq->context), &to_mcq(cq)->buf);
+ else
+ mlx4_free_buf(&to_mcq(cq)->buf);
free(to_mcq(cq));
return 0;
}
+void *mlx4_get_legacy_xrc(struct ibv_srq *srq)
+{
+ struct mlx4_srq *msrq = to_msrq(srq);
+
+ return msrq->ibv_srq_legacy;
+}
+
+void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq)
+{
+ struct mlx4_srq *msrq = to_msrq(srq);
+
+ msrq->ibv_srq_legacy = legacy_xrc_srq;
+ return;
+}
+
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
- struct ibv_srq_init_attr *attr)
+ struct ibv_srq_init_attr *attr)
{
struct mlx4_create_srq cmd;
struct mlx4_create_srq_resp resp;
@@ -309,16 +1015,17 @@
if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
return NULL;
- srq = malloc(sizeof *srq);
+ srq = calloc(1, sizeof *srq);
if (!srq)
return NULL;
- if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
+ if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded))
goto err;
srq->max = align_queue_size(attr->attr.max_wr + 1);
srq->max_gs = attr->attr.max_sge;
srq->counter = 0;
+ srq->ext_srq = 0;
if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
goto err;
@@ -332,15 +1039,13 @@
cmd.buf_addr = (uintptr_t) srq->buf.buf;
cmd.db_addr = (uintptr_t) srq->db;
- ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr,
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
&cmd.ibv_cmd, sizeof cmd,
&resp.ibv_resp, sizeof resp);
if (ret)
goto err_db;
- srq->srqn = resp.srqn;
-
- return &srq->ibv_srq;
+ return &srq->verbs_srq.srq;
err_db:
mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
@@ -355,12 +1060,27 @@
return NULL;
}
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex)
+{
+ if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
+ (attr_ex->srq_type == IBV_SRQT_BASIC))
+ return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+ else if (attr_ex->srq_type == IBV_SRQT_XRC)
+ return mlx4_create_xrc_srq(context, attr_ex);
+
+ return NULL;
+}
+
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
- enum ibv_srq_attr_mask attr_mask)
+ int attr_mask)
{
struct ibv_modify_srq cmd;
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
+
return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
}
@@ -369,199 +1089,98 @@
{
struct ibv_query_srq cmd;
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
+
return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
}
-int mlx4_destroy_srq(struct ibv_srq *ibsrq)
+int mlx4_destroy_srq(struct ibv_srq *srq)
{
- struct mlx4_srq *srq = to_msrq(ibsrq);
- struct mlx4_cq *mcq = NULL;
int ret;
+ struct ibv_srq *legacy_srq = NULL;
- if (ibsrq->xrc_cq) {
- /* is an xrc_srq */
- mcq = to_mcq(ibsrq->xrc_cq);
- mlx4_cq_clean(mcq, 0, srq);
- pthread_spin_lock(&mcq->lock);
- mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn);
- pthread_spin_unlock(&mcq->lock);
+ if (srq->handle == LEGACY_XRC_SRQ_HANDLE) {
+ legacy_srq = srq;
+ srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
}
- ret = ibv_cmd_destroy_srq(ibsrq);
- if (ret) {
- if (ibsrq->xrc_cq) {
- pthread_spin_lock(&mcq->lock);
- mlx4_store_xrc_srq(to_mctx(ibsrq->context),
- srq->srqn, srq);
- pthread_spin_unlock(&mcq->lock);
- }
- return ret;
+ if (to_msrq(srq)->ext_srq) {
+ ret = mlx4_destroy_xrc_srq(srq);
+ if (ret)
+ return ret;
+
+ if (legacy_srq)
+ free(legacy_srq);
+
+ return 0;
}
- mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db);
- mlx4_free_buf(&srq->buf);
- free(srq->wrid);
- free(srq);
+ ret = ibv_cmd_destroy_srq(srq);
+ if (ret)
+ return ret;
+
+ mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
+ mlx4_free_buf(&to_msrq(srq)->buf);
+ free(to_msrq(srq)->wrid);
+ free(to_msrq(srq));
return 0;
}
-static int verify_sizes(struct ibv_qp_init_attr *attr, struct mlx4_context *context)
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr)
{
- int size;
- int nsegs;
-
- if (attr->cap.max_send_wr > context->max_qp_wr ||
- attr->cap.max_recv_wr > context->max_qp_wr ||
- attr->cap.max_send_sge > context->max_sge ||
- attr->cap.max_recv_sge > context->max_sge)
- return -1;
-
- if (attr->cap.max_inline_data) {
- nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type);
- size = MLX4_MAX_WQE_SIZE - nsegs * sizeof (struct mlx4_wqe_inline_seg);
- switch (attr->qp_type) {
- case IBV_QPT_UD:
- size -= (sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_datagram_seg));
- break;
-
- case IBV_QPT_RC:
- case IBV_QPT_UC:
- case IBV_QPT_XRC:
- size -= (sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_raddr_seg));
- break;
-
- default:
- return 0;
- }
-
- if (attr->cap.max_inline_data > size)
- return -1;
- }
-
- return 0;
+ read_init_vars(to_mctx(context));
+ return mlx4_exp_create_qp(context, (struct ibv_exp_qp_init_attr *)attr);
}
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
{
- struct mlx4_create_qp cmd;
- struct ibv_create_qp_resp resp;
- struct mlx4_qp *qp;
- int ret;
- struct mlx4_context *context = to_mctx(pd->context);
-
+ struct ibv_exp_qp_init_attr attr_exp;
+ struct ibv_qp *qp;
+	/* We should copy below only the shared fields, excluding the xrc_domain field.
+	 * Otherwise we may have an ABI issue with applications that were compiled
+	 * without the xrc_domain field. The xrc_domain anyway has no effect on
+	 * the sender side, so there is no need to copy it in/out.
+ */
+ int init_attr_base_size = offsetof(struct ibv_qp_init_attr, xrc_domain);
+
+ /* copying only shared fields */
+ memcpy(&attr_exp, attr, init_attr_base_size);
+ attr_exp.comp_mask = IBV_EXP_QP_INIT_ATTR_PD;
+ attr_exp.pd = pd;
+ qp = mlx4_exp_create_qp(pd->context, &attr_exp);
+ if (qp)
+ memcpy(attr, &attr_exp, init_attr_base_size);
+ return qp;
+}
- /* Sanity check QP size before proceeding */
- if (verify_sizes(attr, context))
- return NULL;
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
+{
+ struct ibv_open_qp cmd;
+ struct ibv_create_qp_resp resp;
+ struct mlx4_qp *qp;
+ int ret;
- qp = malloc(sizeof *qp);
+ qp = calloc(1, sizeof *qp);
if (!qp)
return NULL;
- mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
-
- /*
- * We need to leave 2 KB + 1 WQE of headroom in the SQ to
- * allow HW to prefetch.
- */
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
- qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
- qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
-
- if (attr->srq || attr->qp_type == IBV_QPT_XRC)
- attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
- else {
- if (attr->cap.max_recv_sge < 1)
- attr->cap.max_recv_sge = 1;
- if (attr->cap.max_recv_wr < 1)
- attr->cap.max_recv_wr = 1;
- }
-
- if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp))
- goto err;
-
- mlx4_init_qp_indices(qp);
-
- if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
- pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
- goto err_free;
-
- if (!attr->srq && attr->qp_type != IBV_QPT_XRC) {
- qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
- if (!qp->db)
- goto err_free;
-
- *qp->db = 0;
- }
-
- cmd.buf_addr = (uintptr_t) qp->buf.buf;
- if (attr->srq || attr->qp_type == IBV_QPT_XRC)
- cmd.db_addr = 0;
- else
- cmd.db_addr = (uintptr_t) qp->db;
- cmd.log_sq_stride = qp->sq.wqe_shift;
- for (cmd.log_sq_bb_count = 0;
- qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
- ++cmd.log_sq_bb_count)
- ; /* nothing */
- cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
- memset(cmd.reserved, 0, sizeof cmd.reserved);
-
- pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex);
-
- ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd,
- &resp, sizeof resp);
- if (ret)
- goto err_rq_db;
-
- ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp);
+ ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
+ &cmd, sizeof cmd, &resp, sizeof resp);
if (ret)
- goto err_destroy;
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
-
- qp->rq.wqe_cnt = attr->cap.max_recv_wr;
- qp->rq.max_gs = attr->cap.max_recv_sge;
-
- /* adjust rq maxima to not exceed reported device maxima */
- attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr);
- attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge);
-
- qp->rq.max_post = attr->cap.max_recv_wr;
- mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
-
- qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8);
- if (attr->sq_sig_all)
- qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
-
- return &qp->ibv_qp;
-
-err_destroy:
- ibv_cmd_destroy_qp(&qp->ibv_qp);
-
-err_rq_db:
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
- if (!attr->srq && attr->qp_type != IBV_QPT_XRC)
- mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db);
+ goto err;
-err_free:
- free(qp->sq.wrid);
- if (qp->rq.wqe_cnt)
- free(qp->rq.wrid);
- mlx4_free_buf(&qp->buf);
+ return &qp->verbs_qp.qp;
err:
free(qp);
-
return NULL;
}
int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask,
+ int attr_mask,
struct ibv_qp_init_attr *init_attr)
{
struct ibv_query_qp cmd;
@@ -582,11 +1201,17 @@
}
int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
- enum ibv_qp_attr_mask attr_mask)
+ int attr_mask)
{
struct ibv_modify_qp cmd;
int ret;
+ if (attr_mask & IBV_QP_PORT) {
+ ret = update_port_data(qp, attr->port_num);
+ if (ret)
+ return ret;
+ }
+
if (qp->state == IBV_QPS_RESET &&
attr_mask & IBV_QP_STATE &&
attr->qp_state == IBV_QPS_INIT) {
@@ -598,13 +1223,14 @@
if (!ret &&
(attr_mask & IBV_QP_STATE) &&
attr->qp_state == IBV_QPS_RESET) {
- mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
- qp->srq ? to_msrq(qp->srq) : NULL);
- if (qp->send_cq != qp->recv_cq)
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
mlx4_init_qp_indices(to_mqp(qp));
- if (!qp->srq && qp->qp_type != IBV_QPT_XRC)
+ if (to_mqp(qp)->rq.wqe_cnt)
*to_mqp(qp)->db = 0;
}
@@ -616,14 +1242,19 @@
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
- pthread_spin_lock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
- pthread_spin_lock(&send_cq->lock);
- pthread_spin_lock(&recv_cq->lock);
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ mlx4_lock(&send_cq->lock);
+ else if (qp->recv_cq)
+ mlx4_lock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ mlx4_lock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ mlx4_lock(&send_cq->lock);
+ mlx4_lock(&recv_cq->lock);
} else {
- pthread_spin_lock(&recv_cq->lock);
- pthread_spin_lock(&send_cq->lock);
+ mlx4_lock(&recv_cq->lock);
+ mlx4_lock(&send_cq->lock);
}
}
@@ -632,14 +1263,20 @@
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
- pthread_spin_unlock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
- pthread_spin_unlock(&recv_cq->lock);
- pthread_spin_unlock(&send_cq->lock);
+
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ mlx4_unlock(&send_cq->lock);
+ else if (qp->recv_cq)
+ mlx4_unlock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
+ mlx4_unlock(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ mlx4_unlock(&recv_cq->lock);
+ mlx4_unlock(&send_cq->lock);
} else {
- pthread_spin_unlock(&send_cq->lock);
- pthread_spin_unlock(&recv_cq->lock);
+ mlx4_unlock(&send_cq->lock);
+ mlx4_unlock(&recv_cq->lock);
}
}
@@ -656,246 +1293,120 @@
}
mlx4_lock_cqs(ibqp);
-
- __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
- ibqp->srq ? to_msrq(ibqp->srq) : NULL);
- if (ibqp->send_cq != ibqp->recv_cq)
+ if (ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
- mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+ mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
mlx4_unlock_cqs(ibqp);
pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
- if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC)
- mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
- free(qp->sq.wrid);
+ /*
+ * Use the qp->bf to check if the QP is using dedicated BF.
+ * If so, update the dedicated BF database.
+ */
+ if (qp->bf && (&qp->bf->cmn != &(to_mctx(ibqp->context)->bfs.cmn_bf))) {
+ struct mlx4_bfs_data *bfs = &to_mctx(ibqp->context)->bfs;
+ int idx = &(qp->bf->dedic) - bfs->dedic_bf;
+
+ if (0 <= idx && idx < (MLX4_MAX_BFS_IN_PAGE - 1)) {
+ mlx4_spin_lock(&bfs->dedic_bf_lock);
+ bfs->dedic_bf_used[idx] = 0;
+ bfs->dedic_bf_free++;
+ mlx4_spin_unlock(&bfs->dedic_bf_lock);
+ }
+ }
+
if (qp->rq.wqe_cnt)
- free(qp->rq.wrid);
- mlx4_free_buf(&qp->buf);
+ mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
+
+ mlx4_dealloc_qp_buf(ibqp->context, qp);
+
free(qp);
return 0;
}
-struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd,
+ struct ibv_ah_attr *attr,
+ uint8_t link_layer)
{
struct mlx4_ah *ah;
- struct ibv_port_attr port_attr;
- uint8_t is_mcast;
+
+ if (unlikely(!attr->dlid) &&
+ (link_layer != IBV_LINK_LAYER_ETHERNET)) {
+ errno = EINVAL;
+ return NULL;
+ }
ah = malloc(sizeof *ah);
if (!ah)
return NULL;
- memset(ah, 0, sizeof *ah);
+ memset(&ah->av, 0, sizeof ah->av);
ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24));
- ah->av.g_slid = attr->src_path_bits;
- ah->av.dlid = htons(attr->dlid);
+
+ if (link_layer != IBV_LINK_LAYER_ETHERNET) {
+ ah->av.g_slid = attr->src_path_bits;
+ ah->av.dlid = htons(attr->dlid);
+ ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28);
+ } else {
+ ah->vlan = ((attr->sl & 7) << 13);
+ ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29);
+ }
+
if (attr->static_rate) {
ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
/* XXX check rate cap? */
}
- ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28);
if (attr->is_global) {
ah->av.g_slid |= 0x80;
ah->av.gid_index = attr->grh.sgid_index;
- ah->av.hop_limit = attr->grh.hop_limit;
+ if (attr->grh.hop_limit < 2)
+ ah->av.hop_limit = 0xff;
+ else
+ ah->av.hop_limit = attr->grh.hop_limit;
ah->av.sl_tclass_flowlabel |=
htonl((attr->grh.traffic_class << 20) |
attr->grh.flow_label);
memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
}
- if (ibv_query_port(pd->context, attr->port_num, &port_attr))
- goto err;
-
- if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
- if (ibv_resolve_eth_gid(pd, attr->port_num,
- (union ibv_gid *)ah->av.dgid,
- attr->grh.sgid_index,
- ah->mac, &ah->vlan,
- &ah->tagged, &is_mcast))
- goto err;
-
- if (is_mcast) {
- ah->av.dlid = htons(0xc000);
- ah->av.port_pd |= htonl(1 << 31);
- }
- if (ah->tagged) {
- ah->av.port_pd |= htonl(1 << 29);
- ah->vlan |= (attr->sl & 7) << 13;
- }
- }
-
-
return &ah->ibv_ah;
-err:
- free(ah);
- return NULL;
}
-int mlx4_destroy_ah(struct ibv_ah *ah)
-{
- free(to_mah(ah));
-
- return 0;
-}
-
-#ifdef HAVE_IBV_XRC_OPS
-struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd,
- struct ibv_xrc_domain *xrc_domain,
- struct ibv_cq *xrc_cq,
- struct ibv_srq_init_attr *attr)
+struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
- struct mlx4_create_xrc_srq cmd;
- struct mlx4_create_srq_resp resp;
- struct mlx4_srq *srq;
- int ret;
-
- /* Sanity check SRQ size before proceeding */
- if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
- return NULL;
-
- srq = malloc(sizeof *srq);
- if (!srq)
- return NULL;
-
- if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
- goto err;
-
- srq->max = align_queue_size(attr->attr.max_wr + 1);
- srq->max_gs = attr->attr.max_sge;
- srq->counter = 0;
-
- if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
- goto err;
-
- srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
- if (!srq->db)
- goto err_free;
-
- *srq->db = 0;
-
- cmd.buf_addr = (uintptr_t) srq->buf.buf;
- cmd.db_addr = (uintptr_t) srq->db;
-
- ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr,
- xrc_domain->handle,
- xrc_cq->handle,
- &cmd.ibv_cmd, sizeof cmd,
- &resp.ibv_resp, sizeof resp);
- if (ret)
- goto err_db;
+ struct ibv_ah *ah;
+ struct ibv_exp_port_attr port_attr;
+ struct ibv_port_attr port_attr_legacy;
+ uint8_t link_layer;
- srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn;
+ port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1;
+ port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER;
- ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq);
- if (ret)
- goto err_destroy;
-
- return &srq->ibv_srq;
-
-err_destroy:
- ibv_cmd_destroy_srq(&srq->ibv_srq);
-
-err_db:
- mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
-
-err_free:
- free(srq->wrid);
- mlx4_free_buf(&srq->buf);
+ if (ibv_exp_query_port(pd->context, attr->port_num, &port_attr)) {
+ if (ibv_query_port(pd->context, attr->port_num, &port_attr_legacy))
+ return NULL;
-err:
- free(srq);
-
- return NULL;
-}
-
-struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context,
- int fd, int oflag)
-{
- int ret;
- struct mlx4_open_xrc_domain_resp resp;
- struct mlx4_xrc_domain *xrcd;
-
- xrcd = malloc(sizeof *xrcd);
- if (!xrcd)
- return NULL;
-
- ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd,
- &resp.ibv_resp, sizeof resp);
- if (ret) {
- free(xrcd);
- return NULL;
+ link_layer = port_attr_legacy.link_layer;
+ } else {
+ link_layer = port_attr.link_layer;
}
- xrcd->xrcdn = resp.xrcdn;
- return &xrcd->ibv_xrcd;
-}
-
-int mlx4_close_xrc_domain(struct ibv_xrc_domain *d)
-{
- int ret;
- ret = ibv_cmd_close_xrc_domain(d);
- if (!ret)
- free(d);
- return ret;
-}
-
-int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr,
- uint32_t *xrc_qp_num)
-{
-
- return ibv_cmd_create_xrc_rcv_qp(init_attr, xrc_qp_num);
-}
+ ah = mlx4_create_ah_common(pd, attr, link_layer);
-int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask)
-{
- return ibv_cmd_modify_xrc_rcv_qp(xrc_domain, xrc_qp_num,
- attr, attr_mask);
+ return ah;
}
-int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num,
- struct ibv_qp_attr *attr,
- int attr_mask,
- struct ibv_qp_init_attr *init_attr)
+int mlx4_destroy_ah(struct ibv_ah *ah)
{
- int ret;
-
- ret = ibv_cmd_query_xrc_rcv_qp(xrc_domain, xrc_qp_num,
- attr, attr_mask, init_attr);
- if (ret)
- return ret;
-
- init_attr->cap.max_send_wr = init_attr->cap.max_send_sge = 1;
- init_attr->cap.max_recv_sge = init_attr->cap.max_recv_wr = 0;
- init_attr->cap.max_inline_data = 0;
- init_attr->recv_cq = init_attr->send_cq = NULL;
- init_attr->srq = NULL;
- init_attr->xrc_domain = xrc_domain;
- init_attr->qp_type = IBV_QPT_XRC;
- init_attr->qp_context = NULL;
- attr->cap = init_attr->cap;
+ free(to_mah(ah));
return 0;
}
-
-int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num)
-{
- return ibv_cmd_reg_xrc_rcv_qp(xrc_domain, xrc_qp_num);
-}
-
-int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain,
- uint32_t xrc_qp_num)
-{
- return ibv_cmd_unreg_xrc_rcv_qp(xrc_domain, xrc_qp_num);
-}
-
-#endif
Index: contrib/ofed/libmlx4/src/verbs_exp.c
===================================================================
--- /dev/null
+++ contrib/ofed/libmlx4/src/verbs_exp.c
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2007 Cisco, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+/* Added for reg_mr mmap munmap system calls */
+#include <sys/mman.h>
+#include "mlx4.h"
+#include "mlx4-abi.h"
+#include "mlx4_exp.h"
+#include "wqe.h"
+
+static const char *qptype2key(enum ibv_qp_type type)
+{
+ switch (type) {
+ case IBV_QPT_RC: return "HUGE_RC";
+ case IBV_QPT_UC: return "HUGE_UC";
+ case IBV_QPT_UD: return "HUGE_UD";
+#ifdef _NOT_EXISTS_IN_OFED_2_0
+ case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH";
+#endif
+
+ default: return "HUGE_NA";
+ }
+}
+
+static void update_qp_cap_cache(struct ibv_qp *qp)
+{
+ struct mlx4_context *ctx = to_mctx(qp->context);
+ struct mlx4_qp *mqp = to_mqp(qp);
+
+ if (((qp->qp_type == IBV_QPT_RAW_ETH) && (mqp->link_layer == IBV_LINK_LAYER_ETHERNET)) &&
+ (ctx->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT))
+ mqp->qp_cap_cache |= MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP;
+}
+
+int update_port_data(struct ibv_qp *qp, uint8_t port_num)
+{
+ struct mlx4_qp *mqp = to_mqp(qp);
+ struct ibv_port_attr port_attr;
+ int err;
+
+ err = ibv_query_port(qp->context, port_num, &port_attr);
+ if (err)
+ return err;
+
+ mqp->link_layer = port_attr.link_layer;
+ update_qp_cap_cache(qp);
+
+ return 0;
+}
+
+int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr,
+ uint64_t attr_mask)
+{
+ struct ibv_exp_modify_qp cmd;
+ int ret;
+
+ memset(&cmd, 0, sizeof(cmd));
+ if (attr_mask & IBV_QP_PORT) {
+ ret = update_port_data(qp, attr->port_num);
+ if (ret)
+ return ret;
+ }
+
+ if (qp->state == IBV_QPS_RESET &&
+ (attr_mask & IBV_EXP_QP_STATE) &&
+ attr->qp_state == IBV_QPS_INIT) {
+ mlx4_qp_init_sq_ownership(to_mqp(qp));
+ }
+
+
+ ret = ibv_exp_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
+
+ if (!ret &&
+ (attr_mask & IBV_EXP_QP_STATE) &&
+ attr->qp_state == IBV_QPS_RESET) {
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
+
+ mlx4_init_qp_indices(to_mqp(qp));
+ if (to_mqp(qp)->rq.wqe_cnt)
+ *to_mqp(qp)->db = 0;
+ }
+
+ return ret;
+}
+
+static int verify_sizes(struct ibv_exp_qp_init_attr *attr, struct mlx4_context *context)
+{
+ int size;
+ int nsegs;
+
+ if (attr->cap.max_send_wr > context->max_qp_wr ||
+ attr->cap.max_recv_wr > context->max_qp_wr ||
+ attr->cap.max_send_sge > context->max_sge ||
+ attr->cap.max_recv_sge > context->max_sge)
+ return -1;
+
+ if (attr->cap.max_inline_data) {
+ nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type);
+ size = MLX4_MAX_WQE_SIZE - nsegs * sizeof(struct mlx4_wqe_inline_seg);
+ switch (attr->qp_type) {
+ case IBV_QPT_UD:
+ size -= (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_datagram_seg));
+ break;
+
+ case IBV_QPT_RC:
+ case IBV_QPT_UC:
+ size -= (sizeof(struct mlx4_wqe_ctrl_seg) +
+ sizeof(struct mlx4_wqe_raddr_seg));
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (attr->cap.max_inline_data > size)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int mlx4_exp_alloc_qp_buf(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr,
+ struct mlx4_qp *qp)
+{
+ int ret;
+ enum mlx4_alloc_type alloc_type;
+ enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG;
+ const char *qp_huge_key;
+ int i, wqe_size;
+
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+ wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
+ if ((attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) && (attr->max_inl_recv)) {
+ qp->max_inlr_sg = qp->rq.max_gs;
+ wqe_size = max(wqe_size, attr->max_inl_recv);
+ }
+ for (qp->rq.wqe_shift = 4; 1 << qp->rq.wqe_shift < wqe_size; qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ if (qp->max_inlr_sg) {
+ attr->max_inl_recv = 1 << qp->rq.wqe_shift;
+ qp->max_inlr_sg = attr->max_inl_recv / sizeof(struct mlx4_wqe_data_seg);
+ }
+
+ if (qp->sq.wqe_cnt) {
+ qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
+ if (!qp->sq.wrid)
+ return -1;
+ }
+
+ if (qp->rq.wqe_cnt) {
+ qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t));
+ if (!qp->rq.wrid) {
+ free(qp->sq.wrid);
+ return -1;
+ }
+
+ if (qp->max_inlr_sg) {
+ qp->inlr_buff.buff = malloc(qp->rq.wqe_cnt * sizeof(*(qp->inlr_buff.buff)));
+ if (!qp->inlr_buff.buff) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ return -1;
+ }
+ qp->inlr_buff.len = qp->rq.wqe_cnt;
+ qp->inlr_buff.buff[0].sg_list = malloc(qp->rq.wqe_cnt *
+ sizeof(*(qp->inlr_buff.buff->sg_list)) *
+ qp->max_inlr_sg);
+ if (!qp->inlr_buff.buff->sg_list) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ free(qp->inlr_buff.buff);
+ return -1;
+ }
+ for (i = 1; i < qp->rq.wqe_cnt; i++)
+ qp->inlr_buff.buff[i].sg_list = &qp->inlr_buff.buff[0].sg_list[i * qp->max_inlr_sg];
+ }
+ }
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ if (qp->buf_size) {
+		/* compatibility support */
+ qp_huge_key = qptype2key(attr->qp_type);
+ if (mlx4_use_huge(context, qp_huge_key))
+ default_alloc_type = MLX4_ALLOC_TYPE_HUGE;
+
+
+ mlx4_get_alloc_type(context, MLX4_QP_PREFIX, &alloc_type,
+ default_alloc_type);
+
+ ret = mlx4_alloc_prefered_buf(to_mctx(context), &qp->buf,
+ align(qp->buf_size, to_mdev
+ (context->device)->page_size),
+ to_mdev(context->device)->page_size,
+ alloc_type,
+ MLX4_QP_PREFIX);
+
+ if (ret) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ if (qp->max_inlr_sg) {
+ free(qp->inlr_buff.buff[0].sg_list);
+ free(qp->inlr_buff.buff);
+ }
+ return -1;
+ }
+
+ memset(qp->buf.buf, 0, qp->buf_size);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.buf = qp->buf.buf;
+ qp->sq.buf = qp->buf.buf + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+ } else {
+ qp->rq.buf = qp->buf.buf + (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ qp->sq.buf = qp->buf.buf;
+ }
+
+ } else {
+ qp->buf.buf = NULL;
+ }
+
+ return 0;
+}
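
/*
 * Illustration only (editor's sketch, not part of the patch): the loop at
 * the top of mlx4_exp_alloc_qp_buf() picks the smallest power-of-two RQ WQE
 * stride that fits the computed wqe_size, with a 16-byte (shift 4) minimum.
 */
static int rq_wqe_shift_example(int wqe_size)
{
	int shift;

	for (shift = 4; 1 << shift < wqe_size; shift++)
		; /* nothing */

	/* e.g. 3 SGEs * 16-byte data segs = 48 bytes -> shift 6 (64-byte stride) */
	return shift;
}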
+
+static uint64_t send_db_to_uar(uintptr_t send_db)
+{
+ return (send_db - MLX4_SEND_DOORBELL);
+}
+
+static uint32_t *uar_to_send_db(uintptr_t uar)
+{
+ return (uint32_t *)(uar + MLX4_SEND_DOORBELL);
+}
+
+static void update_qp_bf_data(struct mlx4_res_domain *res_domain,
+ struct mlx4_qp *qp, struct ibv_context *context)
+{
+ switch (res_domain->type) {
+ case MLX4_RES_DOMAIN_BF_SAFE:
+ qp->db_method = MLX4_QP_DB_METHOD_BF;
+ break;
+ case MLX4_RES_DOMAIN_BF_UNSAFE:
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ break;
+ case MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT:
+ if (to_mctx(context)->prefer_bf)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ break;
+ default:
+ break;
+ }
+ qp->bf = &res_domain->send_db->bf;
+ qp->sdb = res_domain->send_db->db_addr;
+ qp->bf_buf_size = to_mctx(context)->bfs.buf_size;
+}
+
+struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context,
+ struct ibv_exp_qp_init_attr *attr)
+{
+ struct mlx4_qp *qp;
+ int ret;
+ union {
+ struct mlx4_create_qp basic;
+ struct mlx4_exp_create_qp extended;
+ } cmd_obj;
+ union {
+ struct ibv_create_qp_resp basic;
+ struct ibv_exp_create_qp_resp extended;
+ } resp_obj;
+ struct mlx4_create_qp_base *cmd = NULL;
+ int ext_kernel_cmd = 0;
+ struct mlx4_bfs_data *bfs = &to_mctx(context)->bfs;
+ int i;
+ unsigned char cq_update;
+ int thread_safe = !mlx4_single_threaded;
+ int db_method_defined = 0;
+
+ memset(&resp_obj, 0, sizeof(resp_obj));
+ memset(&cmd_obj, 0, sizeof(cmd_obj));
+
+ if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_RESERVED1) {
+ errno = ENOSYS;
+ return NULL;
+ }
+
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) {
+ if (attr->srq)
+ attr->max_inl_recv = 0;
+ else
+ attr->max_inl_recv = min(attr->max_inl_recv,
+ (to_mctx(context)->max_sge *
+ sizeof(struct mlx4_wqe_data_seg)));
+ }
+
+ /* Sanity check QP size before proceeding */
+ if (verify_sizes(attr, to_mctx(context)))
+ return NULL;
+
+ if (attr->qp_type == IBV_QPT_XRC && attr->recv_cq &&
+ attr->cap.max_recv_wr > 0 && mlx4_trace)
+		fprintf(stderr, PFX "Warning: Legacy XRC sender should not use a receive cq\n");
+
+ qp = calloc(1, sizeof(*qp));
+ if (!qp)
+ return NULL;
+
+ qp->qp_cap_cache = 0;
+ if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS)
+ ext_kernel_cmd = 1;
+ if (attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
+ } else {
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG &&
+ attr->max_atomic_arg != 0) {
+ if (attr->max_atomic_arg == 8) {
+ qp->is_masked_atomic = 1;
+ } else {
+ fprintf(stderr, "%s: max_atomic_arg = %d is not valid for mlx4 (use 8 or 0)\n",
+ __FUNCTION__, attr->max_atomic_arg);
+ errno = EINVAL;
+ goto err;
+ }
+ }
+
+ mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+#ifdef MLX4_WQE_FORMAT
+ qp->sq_spare_wqes = 0;
+#else
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+#endif
+ qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+ }
+
+ if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
+ attr->qp_type == IBV_QPT_XRC_RECV ||
+ attr->qp_type == IBV_QPT_XRC) {
+ attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV)
+ attr->max_inl_recv = 0;
+ } else {
+ qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
+ if (attr->cap.max_recv_sge < 1)
+ attr->cap.max_recv_sge = 1;
+ if (attr->cap.max_recv_wr < 1)
+ attr->cap.max_recv_wr = 1;
+ }
+
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS)
+ qp->create_flags = attr->exp_create_flags & IBV_EXP_QP_CREATE_MASK;
+
+ if (mlx4_exp_alloc_qp_buf(context, attr, qp))
+ goto err;
+
+ mlx4_init_qp_indices(qp);
+
+ qp->sdb = (uint32_t *) (to_mctx(context)->uar + MLX4_SEND_DOORBELL);
+ if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_RES_DOMAIN) {
+ struct mlx4_res_domain *rd;
+
+ if (!attr->res_domain) {
+ errno = EINVAL;
+ goto err_free;
+ }
+ rd = to_mres_domain(attr->res_domain);
+ if (rd->attr.thread_model == IBV_EXP_THREAD_UNSAFE ||
+ rd->attr.thread_model == IBV_EXP_THREAD_SINGLE)
+ thread_safe = 0;
+
+ if (rd->send_db) {
+ cmd_obj.extended.exp_cmd.uar_virt_add = send_db_to_uar((uintptr_t)rd->send_db->db_addr);
+ update_qp_bf_data(rd, qp, context);
+ db_method_defined = 1;
+ }
+ }
+
+ if (mlx4_lock_init(&qp->sq.lock, thread_safe, mlx4_get_locktype()))
+ goto err_free;
+ if (mlx4_lock_init(&qp->rq.lock, thread_safe, mlx4_get_locktype()))
+ goto sq_lock_destroy;
+
+ cmd = (ext_kernel_cmd ?
+ &cmd_obj.extended.exp_cmd.base : &cmd_obj.basic.base);
+
+ if (attr->cap.max_recv_sge) {
+ qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
+ if (!qp->db)
+ goto rq_lock_destroy;
+
+ *qp->db = 0;
+ cmd->db_addr = (uintptr_t) qp->db;
+ } else {
+ cmd->db_addr = 0;
+ }
+
+ cmd->buf_addr = (uintptr_t) qp->buf.buf;
+ cmd->log_sq_stride = qp->sq.wqe_shift;
+ for (cmd->log_sq_bb_count = 0;
+ qp->sq.wqe_cnt > 1 << cmd->log_sq_bb_count;
+ ++cmd->log_sq_bb_count)
+ ; /* nothing */
+ cmd->sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
+ memset(cmd->reserved, 0, sizeof(cmd->reserved));
+
+ pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);
+ ret = ibv_exp_cmd_create_qp(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ ext_kernel_cmd ?
+ (void *)&cmd_obj.extended.ibv_cmd :
+ (void *)&cmd_obj.basic.ibv_cmd,
+ ext_kernel_cmd ?
+ sizeof(cmd_obj.extended.ibv_cmd) :
+ sizeof(cmd_obj.basic.ibv_cmd),
+ ext_kernel_cmd ?
+ sizeof(cmd_obj.extended.exp_cmd) :
+ sizeof(cmd_obj.basic.base),
+ ext_kernel_cmd ?
+ (void *)&resp_obj.extended : (void *)&resp_obj.basic,
+ ext_kernel_cmd ?
+ sizeof(resp_obj.extended) :
+ sizeof(resp_obj.basic),
+ 0, 0);
+ if (ret) {
+ errno = ret;
+ goto err_rq_db;
+ }
+
+ if (qp->max_inlr_sg && (attr->max_inl_recv != (1 << qp->rq.wqe_shift)))
+ goto err_destroy;
+
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
+ ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
+ if (ret)
+ goto err_destroy;
+ }
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+
+ qp->rq.wqe_cnt = attr->cap.max_recv_wr;
+ qp->rq.max_gs = attr->cap.max_recv_sge;
+
+ /* adjust rq maxima to not exceed reported device maxima */
+ attr->cap.max_recv_wr = min(to_mctx(context)->max_qp_wr,
+ attr->cap.max_recv_wr);
+ attr->cap.max_recv_sge = min(to_mctx(context)->max_sge,
+ attr->cap.max_recv_sge);
+
+ qp->rq.max_post = attr->cap.max_recv_wr;
+ if (attr->qp_type != IBV_QPT_XRC_RECV)
+ mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
+
+ qp->doorbell_qpn = htonl(qp->verbs_qp.qp.qp_num << 8);
+ if (attr->sq_sig_all)
+ cq_update = MLX4_WQE_CTRL_CQ_UPDATE;
+ else
+ cq_update = 0;
+
+ /*
+	 * The srcrb_flags_tbl is a table used to get the right value for the first
+	 * byte of the srcrb_flags field in the WQE ctrl segment.
+	 * The value is derived from the QP sq_sig_all flag and the 4 WR flags
+	 * IBV_EXP_SEND_SIGNALED, IBV_EXP_SEND_SOLICITED, IBV_EXP_SEND_IP_CSUM
+	 * and IBV_EXP_SEND_TUNNEL.
+	 * These flags are used as an index to get the required value from the table.
+	 * The IBV_EXP_SEND_SIGNALED flag defines the first bit of the index,
+	 * IBV_EXP_SEND_SOLICITED defines the second bit, IBV_EXP_SEND_IP_CSUM
+	 * defines the third bit and IBV_EXP_SEND_TUNNEL the fourth one.
+	 * Therefore to calculate the index we can use:
+	 * idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED |
+	 *       (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) |
+	 *       (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2) |
+	 *       (exp_send_flags & IBV_EXP_SEND_TUNNEL)/(IBV_EXP_SEND_TUNNEL >> 3);
+ */
+ qp->srcrb_flags_tbl[0] = cq_update;
+ qp->srcrb_flags_tbl[1] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[2] = MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[3] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[4] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | cq_update;
+ qp->srcrb_flags_tbl[5] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[6] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[7] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[8] = cq_update;
+ qp->srcrb_flags_tbl[9] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[10] = MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[11] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[12] = MLX4_WQE_CTRL_IP_CSUM | cq_update;
+ qp->srcrb_flags_tbl[13] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update;
+ qp->srcrb_flags_tbl[14] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update;
+ qp->srcrb_flags_tbl[15] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update;
+
+ qp->qp_type = attr->qp_type;
+
+ /* Set default value of cached RX csum flags to 0 */
+ qp->cached_rx_csum_flags = 0;
+ /* Set transposed_rx_csum_flags to match the cached_rx_csum_flags = 0 */
+ qp->transposed_rx_csum_flags = IBV_EXP_CQ_RX_OUTER_IPV6_PACKET;
+
+ if (!db_method_defined && bfs->buf_size == 0) {
+ /* not using BF */
+ qp->db_method = MLX4_QP_DB_METHOD_DB;
+ } else if (!db_method_defined) {
+ /*
+ * To gain performance, dedic_bf_free is first tested without taking
+ * the dedic_bf_lock.
+ */
+ if (bfs->dedic_bf_free) {
+ mlx4_spin_lock(&bfs->dedic_bf_lock);
+ for (i = 0 ; i < bfs->num_dedic_bfs; i++) {
+ if (!bfs->dedic_bf_used[i]) {
+ /* using dedicated BF */
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ qp->bf = (union mlx4_bf *)(&bfs->dedic_bf[i]);
+ bfs->dedic_bf_used[i] = 1;
+ bfs->dedic_bf_free--;
+ break;
+ }
+ }
+ mlx4_spin_unlock(&bfs->dedic_bf_lock);
+ }
+ if (!qp->bf) {
+ /* using common BF */
+ if (mlx4_single_threaded)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_BF;
+ qp->bf = (union mlx4_bf *)(&bfs->cmn_bf);
+ }
+ if (qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF &&
+ mlx4_single_threaded && (wc_auto_evict_size() == 64)) {
+ if (to_mctx(context)->prefer_bf)
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB;
+ else
+ qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB;
+ }
+ qp->bf_buf_size = bfs->buf_size;
+ }
+
+ qp->model_flags = thread_safe ? MLX4_QP_MODEL_FLAG_THREAD_SAFE : 0;
+ mlx4_update_post_send_one(qp);
+ qp->pattern = MLX4_QP_PATTERN;
+
+ return &qp->verbs_qp.qp;
+
+err_destroy:
+ ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
+
+err_rq_db:
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+ if (attr->cap.max_recv_sge)
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);
+
+rq_lock_destroy:
+ mlx4_lock_destroy(&qp->rq.lock);
+
+sq_lock_destroy:
+ mlx4_lock_destroy(&qp->sq.lock);
+
+err_free:
+ mlx4_dealloc_qp_buf(context, qp);
+
+err:
+ free(qp);
+
+ return NULL;
+}
+
+int mlx4_exp_query_device(struct ibv_context *context,
+ struct ibv_exp_device_attr *device_attr)
+{
+ struct ibv_exp_query_device cmd;
+ struct ibv_port_attr port_attr;
+ uint64_t raw_fw_ver;
+ int ret;
+ int i;
+
+ ret = ibv_exp_cmd_query_device(context, device_attr, &raw_fw_ver,
+ &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (device_attr->exp_device_cap_flags & IBV_EXP_DEVICE_CROSS_CHANNEL) {
+ device_attr->comp_mask |= IBV_EXP_DEVICE_ATTR_CALC_CAP;
+ device_attr->calc_cap.data_types = (1ULL << IBV_EXP_CALC_DATA_TYPE_INT) |
+ (1ULL << IBV_EXP_CALC_DATA_TYPE_UINT) |
+ (1ULL << IBV_EXP_CALC_DATA_TYPE_FLOAT);
+ device_attr->calc_cap.data_sizes = (1ULL << IBV_EXP_CALC_DATA_SIZE_64_BIT);
+ device_attr->calc_cap.int_ops = (1ULL << IBV_EXP_CALC_OP_ADD) |
+ (1ULL << IBV_EXP_CALC_OP_BAND) |
+ (1ULL << IBV_EXP_CALC_OP_BXOR) |
+ (1ULL << IBV_EXP_CALC_OP_BOR);
+ device_attr->calc_cap.uint_ops = device_attr->calc_cap.int_ops;
+ device_attr->calc_cap.fp_ops = device_attr->calc_cap.int_ops;
+ }
+ device_attr->exp_device_cap_flags |= IBV_EXP_DEVICE_MR_ALLOCATE;
+
+ if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS) &&
+ (device_attr->exp_device_cap_flags & (IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT |
+ IBV_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IBV_EXP_DEVICE_VXLAN_SUPPORT))) {
+ for (i = 0; i < device_attr->phys_port_cnt; i++) {
+ ret = mlx4_query_port(context, i + 1, &port_attr);
+ if (ret)
+ return ret;
+
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ device_attr->exp_device_cap_flags &= ~(IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT |
+ IBV_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IBV_EXP_DEVICE_VXLAN_SUPPORT);
+ break;
+ }
+ }
+ }
+
+ return __mlx4_query_device(
+ raw_fw_ver,
+ (struct ibv_device_attr *)device_attr);
+}
+
+int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num,
+ struct ibv_exp_port_attr *port_attr)
+{
+ /* Check that only valid flags were given */
+ if (!(port_attr->comp_mask & IBV_EXP_QUERY_PORT_ATTR_MASK1) ||
+ (port_attr->comp_mask & ~IBV_EXP_QUERY_PORT_ATTR_MASKS) ||
+ (port_attr->mask1 & ~IBV_EXP_QUERY_PORT_MASK)) {
+ return EINVAL;
+ }
+
+ /* Optimize the link type query */
+ if (port_attr->comp_mask == IBV_EXP_QUERY_PORT_ATTR_MASK1) {
+ if (!(port_attr->mask1 & ~(IBV_EXP_QUERY_PORT_LINK_LAYER |
+ IBV_EXP_QUERY_PORT_CAP_FLAGS))) {
+ struct mlx4_context *mctx = to_mctx(context);
+ if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
+ return EINVAL;
+ if (mctx->port_query_cache[port_num - 1].valid) {
+ if (port_attr->mask1 &
+ IBV_EXP_QUERY_PORT_LINK_LAYER)
+ port_attr->link_layer =
+ mctx->
+ port_query_cache[port_num - 1].
+ link_layer;
+ if (port_attr->mask1 &
+ IBV_EXP_QUERY_PORT_CAP_FLAGS)
+ port_attr->port_cap_flags =
+ mctx->
+ port_query_cache[port_num - 1].
+ caps;
+ return 0;
+ }
+ }
+ if (port_attr->mask1 & IBV_EXP_QUERY_PORT_STD_MASK) {
+ return mlx4_query_port(context, port_num,
+ &port_attr->port_attr);
+ }
+ }
+
+ return EOPNOTSUPP;
+}
+
+struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd,
+ struct ibv_exp_ah_attr *attr_ex)
+{
+ struct ibv_exp_port_attr port_attr;
+ struct ibv_ah *ah;
+ struct mlx4_ah *mah;
+
+ port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1;
+ port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER;
+
+ if (ibv_exp_query_port(pd->context, attr_ex->port_num, &port_attr))
+ return NULL;
+
+ ah = mlx4_create_ah_common(pd, (struct ibv_ah_attr *)attr_ex,
+ port_attr.link_layer);
+
+ if (NULL == ah)
+ return NULL;
+
+ mah = to_mah(ah);
+
+ /* If a VLAN was given, check that we can use it */
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID &&
+ attr_ex->vid <= 0xfff &&
+ (0 == attr_ex->ll_address.len ||
+ !(attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL)))
+ goto err;
+
+ /* ll_address.len == 0 means no ll address given */
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL &&
+ 0 != attr_ex->ll_address.len) {
+ if (LL_ADDRESS_ETH != attr_ex->ll_address.type ||
+ port_attr.link_layer != IBV_LINK_LAYER_ETHERNET)
+ /* mlx4 provider currently supports only ethernet
+ * extensions */
+ goto err;
+
+ /* link layer is ethernet */
+ if (6 != attr_ex->ll_address.len ||
+ NULL == attr_ex->ll_address.address)
+ goto err;
+
+ memcpy(mah->mac, attr_ex->ll_address.address,
+ attr_ex->ll_address.len);
+
+ if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID &&
+ attr_ex->vid <= 0xfff) {
+ mah->av.port_pd |= htonl(1 << 29);
+ mah->vlan = attr_ex->vid |
+ ((attr_ex->sl & 7) << 13);
+ }
+ }
+
+ return ah;
+
+err:
+ free(ah);
+ return NULL;
+}
+
+static struct mlx4_send_db_data *allocate_send_db(struct mlx4_context *ctx)
+{
+ struct mlx4_device *dev = to_mdev(ctx->ibv_ctx.device);
+ struct mlx4_send_db_data *send_db = NULL;
+ unsigned int uar_idx;
+ void *uar;
+ void *bfs;
+ int i;
+
+ if (!ctx->max_ctx_res_domain || !ctx->bfs.buf_size) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ mlx4_spin_lock(&ctx->send_db_lock);
+ if (!list_empty(&ctx->send_db_list)) {
+ send_db = list_entry(ctx->send_db_list.next, struct mlx4_send_db_data, list);
+ list_del(&send_db->list);
+ }
+ mlx4_spin_unlock(&ctx->send_db_lock);
+
+ if (!send_db) {
+ /* Fill up more send_db objects */
+ mlx4_spin_lock(&ctx->send_db_lock);
+ if ((ctx->send_db_num_uars + 1) * ctx->bf_regs_per_page >= ctx->max_ctx_res_domain) {
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ errno = ENOMEM;
+ return NULL;
+ }
+ uar_idx = ctx->send_db_num_uars;
+ ctx->send_db_num_uars++;
+ mlx4_spin_unlock(&ctx->send_db_lock);
+
+ uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+ ctx->ibv_ctx.cmd_fd,
+ dev->page_size * (MLX4_IB_EXP_MMAP_EXT_UAR_PAGE |
+ (uar_idx << MLX4_MMAP_CMD_BITS)));
+ if (uar == MAP_FAILED)
+ return NULL;
+ bfs = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+ ctx->ibv_ctx.cmd_fd,
+ dev->page_size * (MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE |
+ (uar_idx << MLX4_MMAP_CMD_BITS)));
+ if (bfs == MAP_FAILED) {
+ munmap(uar, dev->page_size);
+ return NULL;
+ }
+ mlx4_spin_lock(&ctx->send_db_lock);
+ for (i = 0; i < ctx->bf_regs_per_page; i++) {
+ send_db = calloc(1, sizeof(*send_db));
+ if (!send_db) {
+ if (i)
+ break;
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ mlx4_lock_init(&send_db->bf.cmn.lock,
+ !mlx4_single_threaded,
+ mlx4_get_locktype());
+
+ send_db->db_addr = uar_to_send_db((uintptr_t)uar);
+
+ /* Allocate a pair of blue-flames to toggle sends between them */
+ send_db->bf.cmn.address = bfs + (i * ctx->bfs.buf_size * 2);
+ list_add(&send_db->list, &ctx->send_db_list);
+ }
+
+ /* Return the last send_db object to the caller */
+ list_del(&send_db->list);
+ mlx4_spin_unlock(&ctx->send_db_lock);
+ }
+
+ return send_db;
+}
+
+static void free_send_db(struct mlx4_context *ctx,
+ struct mlx4_send_db_data *send_db)
+{
+ mlx4_spin_lock(&ctx->send_db_lock);
+ list_add(&send_db->list, &ctx->send_db_list);
+ mlx4_spin_unlock(&ctx->send_db_lock);
+}
+
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain_init_attr *attr)
+{
+ struct mlx4_context *ctx = to_mctx(context);
+ struct mlx4_res_domain *res_domain;
+
+ if (attr->comp_mask >= IBV_EXP_RES_DOMAIN_RESERVED) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ res_domain = calloc(1, sizeof(*res_domain));
+ if (!res_domain) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ res_domain->ibv_res_domain.context = context;
+
+ /* set default values */
+ res_domain->attr.thread_model = IBV_EXP_THREAD_SAFE;
+ res_domain->attr.msg_model = IBV_EXP_MSG_DEFAULT;
+ /* get requested valid values */
+ if (attr->comp_mask & IBV_EXP_RES_DOMAIN_THREAD_MODEL)
+ res_domain->attr.thread_model = attr->thread_model;
+ if (attr->comp_mask & IBV_EXP_RES_DOMAIN_MSG_MODEL)
+ res_domain->attr.msg_model = attr->msg_model;
+ res_domain->attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+ IBV_EXP_RES_DOMAIN_MSG_MODEL;
+ /*
+ * Allocate a BF for every resource domain, since BF improves
+ * both the BW and the latency of a single message.
+ */
+ res_domain->send_db = allocate_send_db(ctx);
+
+ /* define resource domain type */
+ if (!res_domain->send_db) {
+ if (res_domain->attr.msg_model == IBV_EXP_MSG_FORCE_LOW_LATENCY)
+ /*
+ * Fail in case the user asked for a forced low-latency
+ * resource-domain but we can't allocate a
+ * dedicated BF.
+ */
+ goto err;
+ else
+ /*
+ * Dedicated BF is not allocated for the
+ * resource-domain.
+ */
+ res_domain->type = MLX4_RES_DOMAIN_BF_NONE;
+ } else {
+ /*
+ * In case a dedicated BF was allocated, set the
+ * resource-domain type according to the
+ * thread-model.
+ */
+ switch (res_domain->attr.thread_model) {
+ case IBV_EXP_THREAD_SAFE:
+ res_domain->type = MLX4_RES_DOMAIN_BF_SAFE;
+ break;
+ case IBV_EXP_THREAD_UNSAFE:
+ res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+ break;
+ case IBV_EXP_THREAD_SINGLE:
+ if (wc_auto_evict_size() == 64)
+ res_domain->type = MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT;
+ else
+ res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+ break;
+ }
+ }
+
+ return &res_domain->ibv_res_domain;
+
+err:
+ free(res_domain);
+
+ return NULL;
+}
+
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+ struct ibv_exp_res_domain *res_dom,
+ struct ibv_exp_destroy_res_domain_attr *attr)
+{
+ struct mlx4_res_domain *res_domain = to_mres_domain(res_dom);
+
+ if (res_domain->send_db)
+ free_send_db(to_mctx(context), res_domain->send_db);
+
+ free(res_domain);
+
+ return 0;
+}
+
+void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params,
+ enum ibv_exp_query_intf_status *status)
+{
+ void *family = NULL;
+ struct mlx4_qp *qp;
+ struct mlx4_cq *cq;
+
+ *status = IBV_EXP_INTF_STAT_OK;
+
+ if (!params->obj) {
+ errno = EINVAL;
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+
+ return NULL;
+ }
+
+ if (params->intf_version > MLX4_MAX_FAMILY_VER) {
+ *status = IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED;
+
+ return NULL;
+ }
+
+ switch (params->intf) {
+ case IBV_EXP_INTF_QP_BURST:
+ qp = to_mqp(params->obj);
+ if (qp->pattern == MLX4_QP_PATTERN) {
+ family = mlx4_get_qp_burst_family(qp, params, status);
+ if (*status != IBV_EXP_INTF_STAT_OK) {
+ fprintf(stderr, PFX "Failed to get QP burst family\n");
+ errno = EINVAL;
+ }
+ } else {
+ fprintf(stderr, PFX "Warning: non-valid QP passed to query interface\n");
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+ errno = EINVAL;
+ }
+ break;
+
+ case IBV_EXP_INTF_CQ:
+ cq = to_mcq(params->obj);
+ if (cq->pattern == MLX4_CQ_PATTERN) {
+ family = (void *)mlx4_get_poll_cq_family(cq, params, status);
+ } else {
+ fprintf(stderr, PFX "Warning: non-valid CQ passed to query interface\n");
+ *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+ errno = EINVAL;
+ }
+ break;
+
+ default:
+ *status = IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED;
+ errno = EINVAL;
+ }
+
+ return family;
+}
+
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+ struct ibv_exp_release_intf_params *params)
+{
+ return 0;
+}
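
As a companion to the srcrb_flags_tbl comment in mlx4_exp_create_qp() above, the sketch below shows how a post-send path could derive the 4-bit table index from the experimental send flags. It is an illustration only, not part of the patch: the helper name is hypothetical, and it assumes the IBV_EXP_SEND_* flags from <infiniband/verbs_exp.h> are the single bits the comment describes.

#include <stdint.h>
#include <infiniband/verbs_exp.h>	/* IBV_EXP_SEND_* flags (assumed) */

/*
 * Map the four relevant send flags onto bits 0..3 of the index.
 * Each division shifts one flag down to its index bit, exactly as the
 * formula in the comment above describes.
 */
static inline unsigned int srcrb_flags_index(uint64_t exp_send_flags)
{
	return (exp_send_flags & IBV_EXP_SEND_SIGNALED) / IBV_EXP_SEND_SIGNALED |
	       (exp_send_flags & IBV_EXP_SEND_SOLICITED) / (IBV_EXP_SEND_SOLICITED >> 1) |
	       (exp_send_flags & IBV_EXP_SEND_IP_CSUM) / (IBV_EXP_SEND_IP_CSUM >> 2) |
	       (exp_send_flags & IBV_EXP_SEND_TUNNEL) / (IBV_EXP_SEND_TUNNEL >> 3);
}

/* A sender would then pick qp->srcrb_flags_tbl[srcrb_flags_index(flags)]
 * as the low byte of the WQE ctrl segment's srcrb_flags. */
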
Index: contrib/ofed/libmlx4/src/wqe.h
===================================================================
--- contrib/ofed/libmlx4/src/wqe.h
+++ contrib/ofed/libmlx4/src/wqe.h
@@ -38,9 +38,19 @@
};
enum {
- MLX4_WQE_CTRL_FENCE = 1 << 6,
- MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
- MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_FENCE = 1 << 6,
+ MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
+ MLX4_WQE_CTRL_SOLICIT = 1 << 1,
+ MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7,
+ MLX4_WQE_CTRL_IIP = 1 << 28,
+ MLX4_WQE_CTRL_IL4 = 1 << 27,
+ MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+ MLX4_WQE_CTRL_IP_CSUM = 1 << 4,
+};
+
+enum {
+ MLX4_WQE_BIND_TYPE_2 = (1<<31),
+ MLX4_WQE_BIND_ZERO_BASED = (1<<30),
};
enum {
@@ -54,8 +64,7 @@
struct mlx4_wqe_ctrl_seg {
uint32_t owner_opcode;
- uint16_t vlan_tag;
- uint8_t ins_vlan;
+ uint8_t reserved[3];
uint8_t fence_size;
/*
* High 24 bits are SRC remote buffer; low 8 bits are flags:
@@ -66,7 +75,10 @@
* [1] SE (solicited event)
* [0] FL (force loopback)
*/
- uint32_t xrcrb_flags;
+ union {
+ uint32_t srcrb_flags;
+ uint16_t srcrb_flags16[2];
+ };
/*
* imm is immediate data for send/RDMA write w/ immediate;
* also invalidation key for send with invalidate; input
@@ -99,6 +111,19 @@
uint32_t reserved2[3];
};
+struct mlx4_wqe_local_inval_seg {
+ uint64_t reserved1;
+ uint32_t mem_key;
+ uint32_t reserved2;
+ uint64_t reserved3[2];
+};
+
+enum {
+ MLX4_WQE_MW_REMOTE_READ = 1 << 29,
+ MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+ MLX4_WQE_MW_ATOMIC = 1 << 31
+};
+
struct mlx4_wqe_raddr_seg {
uint64_t raddr;
uint32_t rkey;
@@ -110,6 +135,13 @@
uint64_t compare;
};
+struct mlx4_wqe_masked_atomic_seg {
+ uint64_t swap_data;
+ uint64_t cmp_data;
+ uint64_t swap_mask;
+ uint64_t cmp_mask;
+};
+
struct mlx4_wqe_bind_seg {
uint32_t flags1;
uint32_t flags2;
@@ -119,4 +151,11 @@
uint64_t length;
};
+struct mlx4_wqe_wait_en_seg {
+ uint32_t valid;
+ uint32_t resv;
+ uint32_t pi;
+ uint32_t obj_num;
+};
+
#endif /* WQE_H */
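
The reworked mlx4_wqe_ctrl_seg above drops the explicit vlan fields and exposes the flags word both as srcrb_flags and, via the union, as two 16-bit halves (srcrb_flags16) for callers that only need to touch part of it. Below is a minimal sketch of how the new checksum bits could be combined when filling the control segment; it is illustrative only and not taken from the patch, and it assumes the surrounding post-send logic and byte-order handling follow the driver's existing conventions.

#include <stdint.h>
#include <arpa/inet.h>	/* htonl */
#include "wqe.h"	/* the header patched above */

/* Request a CQE plus IP and TCP/UDP checksum offload for one WQE. */
static void ctrl_set_csum_flags(struct mlx4_wqe_ctrl_seg *ctrl)
{
	ctrl->srcrb_flags = htonl(MLX4_WQE_CTRL_CQ_UPDATE |
				  MLX4_WQE_CTRL_IP_CSUM |
				  MLX4_WQE_CTRL_TCP_UDP_CSUM);
}
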
Index: contrib/ofed/usr.lib/libmlx4/Makefile
===================================================================
--- contrib/ofed/usr.lib/libmlx4/Makefile
+++ contrib/ofed/usr.lib/libmlx4/Makefile
@@ -14,7 +14,7 @@
SHLIB_MAJOR= 1
MK_PROFILE= no
-SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c
+SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c verbs_exp.c
LIBADD= ibverbs pthread
CFLAGS+= -DHAVE_CONFIG_H
Index: contrib/ofed/usr.lib/libmlx4/config.h
===================================================================
--- contrib/ofed/usr.lib/libmlx4/config.h
+++ contrib/ofed/usr.lib/libmlx4/config.h
@@ -1,4 +1,3 @@
-#define HAVE_IBV_DONTFORK_RANGE
-#define HAVE_IBV_DOFORK_RANGE
-#define HAVE_IBV_REGISTER_DRIVER
-#define HAVE_IBV_READ_SYSFS_FILE
+#define HAVE_IBV_DOFORK_RANGE 1
+#define HAVE_IBV_DONTFORK_RANGE 1
+#define HAVE_IBV_REGISTER_DRIVER 1
