Index: contrib/ofed/libmlx4/Makefile.am
===================================================================
--- contrib/ofed/libmlx4/Makefile.am
+++ contrib/ofed/libmlx4/Makefile.am
@@ -1,12 +1,19 @@
-AM_CFLAGS = -g -Wall -D_GNU_SOURCE
+AM_CFLAGS = -g -Wall -Werror -D_GNU_SOURCE
 mlx4_version_script = @MLX4_VERSION_SCRIPT@
 MLX4_SOURCES = src/buf.c src/cq.c src/dbrec.c src/mlx4.c src/qp.c \
-    src/srq.c src/verbs.c
+    src/srq.c src/verbs.c src/verbs_exp.c
+noinst_HEADERS = src/bitmap.h src/doorbell.h src/list.h src/mlx4-abi.h src/mlx4_exp.h src/mlx4.h src/wqe.h
 if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
-    lib_LTLIBRARIES = src/libmlx4.la
+    lib_LTLIBRARIES =
+else
+    mlx4lib_LTLIBRARIES =
+endif
+
+if HAVE_IBV_DEVICE_LIBRARY_EXTENSION
+    lib_LTLIBRARIES += src/libmlx4.la
     src_libmlx4_la_SOURCES = $(MLX4_SOURCES)
     src_libmlx4_la_LDFLAGS = -avoid-version -release @IBV_DEVICE_LIBRARY_EXTENSION@ \
         $(mlx4_version_script)
@@ -14,13 +21,14 @@
     mlx4conf_DATA = mlx4.driver
 else
     mlx4libdir = $(libdir)/infiniband
-    mlx4lib_LTLIBRARIES = src/mlx4.la
+    mlx4lib_LTLIBRARIES += src/mlx4.la
     src_mlx4_la_SOURCES = $(MLX4_SOURCES)
     src_mlx4_la_LDFLAGS = -avoid-version -module $(mlx4_version_script)
 endif
-EXTRA_DIST = src/doorbell.h src/mlx4.h src/mlx4-abi.h src/wqe.h \
-    src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST = src/mlx4.map libmlx4.spec.in mlx4.driver
+EXTRA_DIST += debian
+EXTRA_DIST += autogen.sh
 dist-hook: libmlx4.spec
 	cp libmlx4.spec $(distdir)
Index: contrib/ofed/libmlx4/autogen.sh
===================================================================
--- contrib/ofed/libmlx4/autogen.sh
+++ contrib/ofed/libmlx4/autogen.sh
@@ -1,4 +1,4 @@
-#! /bin/sh
+#! /bin/sh -eE
 set -x
 aclocal -I config
Index: contrib/ofed/libmlx4/configure.ac
===================================================================
--- contrib/ofed/libmlx4/configure.ac
+++ contrib/ofed/libmlx4/configure.ac
@@ -1,12 +1,15 @@
 dnl Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.57)
-AC_INIT(libmlx4, 1.0, general@lists.openfabrics.org)
+AC_INIT(libmlx4, 1.0.6mlnx1, linux-rdma@vger.kernel.org)
 AC_CONFIG_SRCDIR([src/mlx4.h])
 AC_CONFIG_AUX_DIR(config)
-AM_CONFIG_HEADER(config.h)
-AM_INIT_AUTOMAKE(libmlx4, 1.0)
-AM_PROG_LIBTOOL
+AC_CONFIG_HEADER(config.h)
+AM_INIT_AUTOMAKE([1.10 foreign tar-ustar silent-rules subdir-objects])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_PROG_LIBTOOL
+LT_INIT
 AC_ARG_WITH([valgrind],
     AC_HELP_STRING([--with-valgrind],
@@ -21,6 +24,13 @@ fi
 fi
+#--with-wqe-format
+AC_ARG_WITH([wqe-format],
+    AC_HELP_STRING([--with-wqe-format],
+        [Enable wqe-format annotations (default NO)]),
+    AC_DEFINE([MLX4_WQE_FORMAT], 1, [Define to 1 to enable wqe-format annotations.]),
+)
+
 dnl Checks for programs
 AC_PROG_CC
@@ -32,22 +42,19 @@
 AC_CHECK_HEADER(infiniband/driver.h, [],
     AC_MSG_ERROR([<infiniband/driver.h> not found. libmlx4 requires libibverbs.]))
 AC_HEADER_STDC
-AC_CHECK_HEADER(valgrind/memcheck.h,
-    [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
-        [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
-    [if test $want_valgrind = yes; then
-        AC_MSG_ERROR([Valgrind memcheck support requested, but not found.])
-    fi])
+
+if test x$want_valgrind = xyes; then
+    AC_CHECK_HEADER(valgrind/memcheck.h,
+        [AC_DEFINE(HAVE_VALGRIND_MEMCHECK_H, 1,
+            [Define to 1 if you have the <valgrind/memcheck.h> header file.])],
+        [if test $want_valgrind = yes; then
+            AC_MSG_ERROR([Valgrind memcheck support requested, but not found.])
+        fi])
+fi
 dnl Checks for typedefs, structures, and compiler characteristics.
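For orientation, a minimal sketch of how the two configure-time results introduced above (MLX4_WQE_FORMAT and HAVE_VALGRIND_MEMCHECK_H) are typically consumed from the generated config.h; the no-op fallback macro here is an assumption for illustration, not code from this patch.

/* sketch only: consuming the configure results in a C translation unit */
#include "config.h"                      /* generated by the configure script */

#ifdef HAVE_VALGRIND_MEMCHECK_H
#include <valgrind/memcheck.h>
#else
/* assumed no-op fallback so the driver builds without valgrind headers */
#define VALGRIND_MAKE_MEM_DEFINED(addr, len) 0
#endif

#ifdef MLX4_WQE_FORMAT
/* --with-wqe-format was given: alternate WQE layout code is compiled in */
#endif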
AC_C_CONST AC_CHECK_SIZEOF(long) -AC_CHECK_MEMBER(struct ibv_context.more_ops, - [AC_DEFINE([HAVE_IBV_MORE_OPS], 1, [Define to 1 if more_ops is a member of ibv_context])],, - [#include ]) -AC_CHECK_MEMBER(struct ibv_more_ops.create_xrc_srq, - [AC_DEFINE([HAVE_IBV_XRC_OPS], 1, [Define to 1 if have xrc ops])],, - [#include ]) dnl Checks for library functions AC_CHECK_FUNC(ibv_read_sysfs_file, [], Index: contrib/ofed/libmlx4/debian/changelog =================================================================== --- contrib/ofed/libmlx4/debian/changelog +++ contrib/ofed/libmlx4/debian/changelog @@ -1,8 +1,201 @@ -libmlx4 (1.0-2) unstable; urgency=low +libmlx4 (1.0.6mlnx1-1) unstable; urgency=low - * Add debian/watch file + * libmlx4: Fix MR address change in rereg_mr + * libmlx4: revert the endianess fix for immediate data + * libmlx4: split post_send_one to qp types + * libmlx4: Add post_send_one to qp struct + * libmlx4: remove inl from basic set_data_seg functions + * libmlx4: Set data segment in one function + * libmlx4: set ctrl segment in one funtion + * libmlx4: use htonl when copy immediate data to WQE + * libmlx4: fix bug in bf_buf_size update + * libmlx4: Define set_data_seg as inline function + * libmlx4: reduce cache used by datapath + * libmlx4: optimize wq_overflow + * libmlx4: Add anothe DB ringing method + * libmlx4: Use x86_64 SSE2 instructions to improve bf_copy + * libmlx4: Add new DB ringing mode + * libmlx4: use all 8 BFs + * libmlx4: split ring_db function + * libmlx4: add door-bell ring function + * Modify call from ibv_exp_getenv to ibv_exp_cmd_getenv + * libmlx4: fix contiguous page registration + * Modify to use verbs specific getenv + * libmlx4: avoid creating AH with DLID 0 + * libmlx4: fixed resize cq overrun bug + * libmlx4.spec.in: Changed valgrind libs DESTDIR + * Added valgrind support + * fixed and added valgrind Macros + * Adding experimental dereg_mr support + * shared_mr: handle duplication from glob/procfs + * shared_mr: fine-tuned counter mode name + * fix 32 bit compile warning + * shared mr with counter name support + * libmlx4: allow user to specify the addr of contig pages. + * libmlx4: avoid using gettimeofday in mlx4_reg_shared_mr. + * libmlx4: init exp_mw_bind. + * libmlx4: added -Werror to Makefile + * ibmlx4: Use masked atomics only if max_atomic_arg defined + * wc_flags should be set even when using experimental verbs + * libmlx4: return errno on ibv_post_srq_recv + * libmlx4: Retry open shared mr file + * libmlx4: Add completion opcodes for masked atomic operations + * Verify hop_limit > 1 in create_ah + * libmlx4.spec.in: Support configure_options flag. + * configure: Update AM_INIT_AUTOMAKE to support new auto tools. + * Add MR re-registeration + * mlx4: Add support for timestamping when initiating context. + * libmlx4: Do not publish support for IBV_CALC_OP_MAXLOC + * Fix comp_mask handling in ibv_exp_query_values + * libmlx4: Simplify extended atomics API + * libmlx4: Fix wrong wqe pointer advance + * libmlx4: Add support for masked atomics + * Revert "libmlx4: Fix log function to avoid overflow" + * libmlx4: add ibv_exp_modify_qp to mlx4 + * libmlx4: Fix overflow on flag mask + * libmlx4: Fix log function to avoid overflow + * libmlx4: improve experimental interface + * A correct AH was free'd by mistake + * Align create_ah_ex and query_port_ex to upstream + * Change imm_data to ex.imm_data or ex.invalidate_rkey + * libmlx4: change wc_size from int to uint32_t. + * libmlx4: Print prefer_bf message only in trace mode. 
+ * libmlx4: separate mlx4_post_send to EXP & NON EXP - -- Roland Dreier Wed, 12 Mar 2008 10:40:19 -0700 + -- Vladimir Sokolovsky Wed, 10 Dec 2014 10:53:10 +0200 + +libmlx4 (1.0.5mlnx1-1) unstable; urgency=low + + * resize_cq: fix possible endless loop scanning CQ + * User QP/SRQ in work completion + * libmlx4: Align verbs interface with upstream + * libmlx4: add ibv_exp_reg_mr experimental verb + * libmlx4: Change legacy extended verbs to experimental verbs + * libmlx4: Change legacy extended uverbs to experimental uverbs + * unmap hca_clock_page in mlx4_uninit_context + * Enable contigous pages for Control resources by default + * New experimental verbs for query_port + * Added htobe64 definition which is missing on SLES10 + * Fix QoS issues for UD QPs + * Allocate zoeroized memory for CQ + * libmlx4: Change sandy bridge work around algorithm + * libmlx4: add debian to EXTRA_DIST + * libmlx4: add support for "git review" command line gerrit tool + * libmlx4: Fix "make distcheck" + * Add allowed_wc_flags + * libmlx4: Fix valgrind errors. + * Raw IB QP fix + * libmlx4: Change inline receive interface + * Revert "move flow steering to experimental verbs" + * move flow steering to experimental verbs + * libmlx4: resolve segfault on ibv_xsrq_pingpong + * Raw Eth QP - prevent loopback on SRIOV + * libmlx4: remove struct ts and use direct field timestamp + * Fix compilation issue due to shifting bind_mw struct in ib_send_wr + * libmlx4: Add experimental inline receive + * Double check in order to prevent division by zero. + * Add a missing check for a value of a certain variable + * libmlx4 - qp: optimize single segment case around set_data_seg() + * libmlx4 - Inform GCC about hotspot functions so those can be optimized more aggressively. + * libmlx4 - Add branch prediction helpers to qp and cq data path functions. + * libmlx4 - Using unsigned indices allow GCC to generate a bit more efficient code. + * IP based addressing support + * Implementing verbs bind_mw (for binding type 1 memory windows) + * Adding support to post bind (type 2) memory windows + * Adding support to post invalidate messages + * Implementing verbs alloc_mw and dealloc_mw + * Adding work completions that are related to memory windows + * fix incorrect timestamp + * add a workaround for hw bug in hwclock wraparound + * extension verb: mlx4_query_values are reading hwclock + * extension verb: mlx4_query_device_ex + * extension verb: mlx4_create_cq_ex + * implement ibv_poll_cq_ex extension verb + * XRC - move warning to be under trace mode + * XRC - fix leak in legacy flow + * libmlx4 : Globaly avoid spinlocks for multithreaded apps + * Handle missing symbols in Xen server 6.1 + * libmlx4: Cache link layer's type in mlx4_context. Caching will allow us to avoid ibv_query_port calls and save time in ibv_create_ah. 
+ * XRC - sync to latest upstream changes + * XRC issues + * libmlx4: XRC binary compat layer + + -- Vladimir Sokolovsky Sun, 23 Mar 2014 14:16:10 +0200 + +libmlx4 (1.0.4mlnx2-1) unstable; urgency=low + + * libmlx4: Add Cross-channel capability + * libmlx4: Add mlx4_post_task + * libmlx4: Add mlx4_query_device_ex + * libmlx4: Add mlx4_modify_cq + * libmlx4: Support Cross-channel capability in mlx4_create_qp_ex + * libmlx4: Add new fields and opcodes to support Cross-channel + * libmlx4: Remove legacy mverbs code + * libmlx4: Add support for XRC QPs + * libmlx4: contig pages over 4GB + * stall code to be run only on x86 + * Implement ibv_create_flow and ibv_destroy_flow + * Revert "Add support for ibv_attach_flow and ibv_detach_flow." + * libmlx4 fix compilation warnings + * Handle 0-length s/g list entries correctly + * libmlx4.spec.in: Fix %files macro + * configure: disable mverbs by default + * libmlx4: verbs extensions breaks MVERBS implementation + * shared_mr support on top of verbs extension + * libmlx4: Infra-structure changes to support verbs extensions + * fixed an issue with definition of container_of + * Revert "verbs extension mechanism based on Sean first patch" + + -- Vladimir Sokolovsky Mon, 7 Jan 2013 13:38:10 +0200 + +libmlx4 (1.0.4mlnx1-1) unstable; urgency=low + + * New Mellanox release. + + -- Vladimir Sokolovsky Mon, 7 Jan 2013 13:38:10 +0200 + +libmlx4 (1.0.4-1) unstable; urgency=low + + * New upstream release. + - IBoE multicast support. + * Update maintainer and remove DM-Upload-Allowed now that I'm a DD. + + -- Roland Dreier Wed, 28 Mar 2012 10:31:52 -0700 + +libmlx4 (1.0.3-1) unstable; urgency=low + + * New upstream release. + - Add ConnectX-3 support. + - Add IBoE support. + * Since we have plugin in /usr/lib/libibverbs, we need to depend on + libibverbs (>= 1.1.3). + + -- Roland Dreier Wed, 06 Jul 2011 23:54:24 -0700 + +libmlx4 (1.0.2-1) unstable; urgency=low + + * New upstream release. + - Fix potential problems running under Valgrind. + - Add support for resize CQ operation. + - Fix other minor bugs. + * Update maintainer and set DM-Upload-Allowed to yes. (Closes: #632108) + * Switch to dpkg-source 3.0 (quilt) format. + * Acknowledge NMU (Closes: #621664). + * Change build system from cdbs to debhelper 7. + * Use libibverbs 1.1.3 feature to move plugin to /usr/lib/libibverbs + to fix multiple problems with a not-exactly-shlib in /usr/lib. + * Add debian/watch file. + * Move -dbg package to section debug. + * Update to Standards-Version: 3.9.2. + + -- Roland Dreier Wed, 06 Jul 2011 13:32:18 -0700 + +libmlx4 (1.0-1.1) unstable; urgency=low + + * Non-maintainer upload. + * Don't ship .la files (Closes: #621664). 
+ + -- Luk Claes Fri, 01 Jul 2011 19:09:59 +0200 libmlx4 (1.0-1) unstable; urgency=low Index: contrib/ofed/libmlx4/debian/compat =================================================================== --- contrib/ofed/libmlx4/debian/compat +++ contrib/ofed/libmlx4/debian/compat @@ -1 +1 @@ -5 +7 Index: contrib/ofed/libmlx4/debian/control =================================================================== --- contrib/ofed/libmlx4/debian/control +++ contrib/ofed/libmlx4/debian/control @@ -1,16 +1,16 @@ Source: libmlx4 Priority: extra -Maintainer: Roland Dreier -Build-Depends: @cdbs@, libibverbs-dev (>= 1.0) -Standards-Version: 3.7.3 +Maintainer: Roland Dreier +Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.3) +Standards-Version: 3.9.2 Section: libs Homepage: http://www.openfabrics.org/ Package: libmlx4-1 Section: libs Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends} -Description: A userspace driver for Mellanox ConnectX InfiniBand HCAs +Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.3) +Description: Userspace driver for Mellanox ConnectX InfiniBand HCAs libmlx4 is a device-specific driver for Mellanox ConnectX InfiniBand host channel adapters (HCAs) for the libibverbs library. This allows userspace processes to access Mellanox HCA hardware directly with @@ -32,7 +32,7 @@ directly to an application, which may be useful for debugging. Package: libmlx4-1-dbg -Section: libdevel +Section: debug Priority: extra Architecture: any Depends: ${misc:Depends}, libmlx4-1 (= ${binary:Version}) Index: contrib/ofed/libmlx4/debian/libmlx4-1.install =================================================================== --- contrib/ofed/libmlx4/debian/libmlx4-1.install +++ contrib/ofed/libmlx4/debian/libmlx4-1.install @@ -1,2 +1,2 @@ -usr/lib/libmlx4-rdmav2.so +usr/lib/libmlx4-rdmav2.so /usr/lib/libibverbs/ etc/libibverbs.d/mlx4.driver Index: contrib/ofed/libmlx4/debian/libmlx4-dev.install =================================================================== --- contrib/ofed/libmlx4/debian/libmlx4-dev.install +++ contrib/ofed/libmlx4/debian/libmlx4-dev.install @@ -1 +1 @@ -usr/lib/libmlx4.{a,la} +usr/lib/libmlx4.a Index: contrib/ofed/libmlx4/debian/rules =================================================================== --- contrib/ofed/libmlx4/debian/rules +++ contrib/ofed/libmlx4/debian/rules @@ -1,8 +1,10 @@ #!/usr/bin/make -f # -*- mode: makefile; coding: utf-8 -*- -DEB_DH_INSTALL_SOURCEDIR := debian/tmp -DEB_AUTO_UPDATE_LIBTOOL := post +%: + dh $@ -include /usr/share/cdbs/1/rules/debhelper.mk -include /usr/share/cdbs/1/class/autotools.mk +override_dh_strip: + dh_strip --dbg-package=libmlx4-1-dbg + +override_dh_makeshlibs: Index: contrib/ofed/libmlx4/libmlx4.spec.in =================================================================== --- contrib/ofed/libmlx4/libmlx4.spec.in +++ contrib/ofed/libmlx4/libmlx4.spec.in @@ -1,15 +1,27 @@ +%{!?_with_valgrind: %define _with_valgrind 0} +%{!?_disable_valgrind: %define _disable_valgrind 0} + +%if 0%{?rhel} == 6 +%if 0%{_disable_valgrind} == 0 +%define _with_valgrind 1 +%endif +%endif + Name: libmlx4 -Version: 1.0 -Release: 2%{?dist} +Version: 1.0.6mlnx1 +Release: 1%{?dist} Summary: Mellanox ConnectX InfiniBand HCA Userspace Driver Group: System Environment/Libraries License: GPLv2 or BSD Url: http://openfabrics.org/ -Source: http://openfabrics.org/downloads/mlx4/libmlx4-1.0.tar.gz +Source: http://openfabrics.org/downloads/mlx4/libmlx4-%{version}.tar.gz BuildRoot: %(mktemp -ud 
%{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) -BuildRequires: libibverbs-devel >= 1.1-0.1.rc2 +BuildRequires: libibverbs-devel >= 1.1.6mlnx2 +%if %{_with_valgrind} +BuildRequires: valgrind-devel +%endif %description libmlx4 provides a device-specific userspace driver for Mellanox @@ -29,12 +41,24 @@ %setup -q -n %{name}-@VERSION@ %build -%configure +%if %{_with_valgrind} +%configure %{?configure_options} --libdir=%{_libdir}/mlnx_ofed/valgrind --with-valgrind +make %{?_smp_mflags} +make DESTDIR=$RPM_BUILD_DIR/%{name}-%{version}/valgrind install +rm -f $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind/*.*a +make clean +%endif + +%configure %{?configure_options} make %{?_smp_mflags} %install rm -rf $RPM_BUILD_ROOT make DESTDIR=%{buildroot} install +%if %{_with_valgrind} +mkdir -p %{buildroot}/%{_libdir}/mlnx_ofed +cp -a $RPM_BUILD_DIR/%{name}-%{version}/valgrind/%{_libdir}/mlnx_ofed/valgrind %{buildroot}/%{_libdir}/mlnx_ofed +%endif # remove unpackaged files from the buildroot rm -f $RPM_BUILD_ROOT%{_libdir}/*.la $RPM_BUILD_ROOT%{_libdir}/libmlx4.so @@ -43,15 +67,34 @@ %files %defattr(-,root,root,-) -%{_libdir}/libmlx4-rdmav2.so +%{_libdir}/libmlx4*.so +%if %{_with_valgrind} +%{_libdir}/mlnx_ofed/valgrind/libmlx4*.so +%endif %{_sysconfdir}/libibverbs.d/mlx4.driver %doc AUTHORS COPYING README %files devel %defattr(-,root,root,-) -%{_libdir}/libmlx4.a +%{_libdir}/libmlx4*.a %changelog +* Mon Mar 28 2012 Roland Dreier - 1.0.4-1 +- New upstream release + +* Mon Mar 26 2012 Roland Dreier - 1.0.3-1 +- New upstream release + +* Wed Jul 6 2011 Roland Dreier - 1.0.2-1 +- New upstream release + +* Wed Jun 17 2009 Roland Dreier - 1.0.1-1 +- New upstream release +- Change openib.org URLs to openfabrics.org URLs + +* Wed Feb 25 2009 Fedora Release Engineering - 1.0-3 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_11_Mass_Rebuild + * Sun Jan 27 2008 Roland Dreier - 1.0-2 - Spec file cleanups, based on Fedora review: don't mark libmlx4.driver as a config file, since it is not user modifiable, Index: contrib/ofed/libmlx4/src/bitmap.h =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/bitmap.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2000, 2011 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef BITMAP_H +#define BITMAP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef min +#define min(a, b) \ + ({ typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; }) +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define MLX4_SHM_ADDR (void *)(0x8000000000000000UL) +#define MLX4_SHMAT_FLAGS (SHM_RND) +#else +#define MLX4_SHM_ADDR (void *)(0x0UL) +#define MLX4_SHMAT_FLAGS (0) +#endif + +struct __dummy_h { unsigned long a[100]; }; +#define MLX4_ADDR (*(struct __dummy_h *) addr) +#define MLX4_CONST_ADDR (*(const struct __dummy_h *) addr) + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define BITS_PER_BYTE 8 +#define BITS_PER_WORD (BITS_PER_BYTE * sizeof(uint32_t)) +#define BITS_TO_WORDS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(uint32_t)) + +#ifndef HPAGE_SIZE +#define HPAGE_SIZE (2UL*1024*1024) +#endif + +#define MLX4_SHM_LENGTH (HPAGE_SIZE) +#define MLX4_Q_CHUNK_SIZE 32768 +#define MLX4_SHM_NUM_REGION 64 + +struct mlx4_bitmap { + uint32_t last; + uint32_t top; + uint32_t max; + uint32_t avail; + uint32_t mask; + struct mlx4_spinlock lock; + uint32_t *table; +}; + +inline unsigned long mlx4_ffz(uint32_t word) +{ + return __builtin_ffs(~word) - 1; +} + +inline void mlx4_set_bit(unsigned int nr, uint32_t *addr) +{ + + addr[(nr / BITS_PER_WORD)] + |= (1 << (nr % BITS_PER_WORD)); + + +} + +inline void mlx4_clear_bit(unsigned int nr, uint32_t *addr) +{ + addr[(nr / BITS_PER_WORD)] + &= ~(1 << (nr % BITS_PER_WORD)); +} + +inline int mlx4_test_bit(unsigned int nr, const uint32_t *addr) +{ + return !!(addr[(nr / BITS_PER_WORD)] + & (1 << (nr % BITS_PER_WORD))); +} + +inline uint32_t mlx4_find_first_zero_bit(const uint32_t *addr, + uint32_t size) +{ + const uint32_t *p = addr; + uint32_t result = 0; + uint32_t tmp; + + while (size & ~(BITS_PER_WORD - 1)) { + tmp = *(p++); + if (~tmp) + goto found; + result += BITS_PER_WORD; + size -= BITS_PER_WORD; + } + if (!size) + return result; + + tmp = (*p) | (~0UL << size); + if (tmp == (uint32_t)~0UL) /* Are any bits zero? */ + return result + size; /* Nope. 
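As a quick illustration of the word/offset arithmetic used by mlx4_set_bit(), mlx4_test_bit() and mlx4_ffz() above (a standalone example; the bit index and table size are chosen arbitrarily):

#include <assert.h>
#include <stdint.h>

#define BITS_PER_WORD (8 * sizeof(uint32_t))

int main(void)
{
    uint32_t table[3] = {0, 0, 0};
    unsigned int nr = 70;                        /* bit 70 of the bitmap */

    /* same arithmetic as mlx4_set_bit(): word 70/32 == 2, offset 70%32 == 6 */
    table[nr / BITS_PER_WORD] |= 1u << (nr % BITS_PER_WORD);
    assert(table[2] == (1u << 6));

    /* mlx4_ffz() scans a word for its first zero bit: bit 0 of word 0 here */
    assert((__builtin_ffs(~table[0]) - 1) == 0);
    return 0;
}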
*/ +found: + return result + mlx4_ffz(tmp); +} + +int mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap) +{ + uint32_t obj; + int ret; + + mlx4_spin_lock(&bitmap->lock); + + obj = mlx4_find_first_zero_bit(bitmap->table, bitmap->max); + if (obj < bitmap->max) { + mlx4_set_bit(obj, bitmap->table); + bitmap->last = (obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + --bitmap->avail; + + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +static inline uint32_t find_aligned_range(uint32_t *bitmap, + uint32_t start, uint32_t nbits, + int len, int alignment) +{ + uint32_t end, i; + +again: + start = align(start, alignment); + + while ((start < nbits) && mlx4_test_bit(start, bitmap)) + start += alignment; + + if (start >= nbits) + return -1; + + end = start + len; + if (end > nbits) + return -1; + + for (i = start + 1; i < end; i++) { + if (mlx4_test_bit(i, bitmap)) { + start = i + 1; + goto again; + } + } + + return start; +} + +static inline int mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align) +{ + uint32_t obj; + int ret, i; + + if (cnt == 1 && align == 1) + return mlx4_bitmap_alloc(bitmap); + + if (cnt > bitmap->max) + return -1; + + mlx4_spin_lock(&bitmap->lock); + + obj = find_aligned_range(bitmap->table, bitmap->last, + bitmap->max, cnt, align); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + obj = find_aligned_range(bitmap->table, 0, bitmap->max, + cnt, align); + } + + if (obj < bitmap->max) { + for (i = 0; i < cnt; i++) + mlx4_set_bit(obj + i, bitmap->table); + if (obj == bitmap->last) { + bitmap->last = (obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + bitmap->avail -= cnt; + + mlx4_spin_unlock(&bitmap->lock); + + return obj; +} + +static inline void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, uint32_t obj, + int cnt) +{ + int i; + + obj &= bitmap->max - 1; + + mlx4_spin_lock(&bitmap->lock); + for (i = 0; i < cnt; i++) + mlx4_clear_bit(obj + i, bitmap->table); + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + bitmap->avail += cnt; + mlx4_spin_unlock(&bitmap->lock); +} + +static inline int is_bitmap_empty(struct mlx4_bitmap *bitmap) +{ + int ret; + + mlx4_spin_lock(&bitmap->lock); + ret = (bitmap->avail == bitmap->max) ? 1 : 0; + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +static inline int is_bitmap_avail(struct mlx4_bitmap *bitmap) +{ + int ret; + + mlx4_spin_lock(&bitmap->lock); + ret = (bitmap->avail > 0) ? 
1 : 0; + mlx4_spin_unlock(&bitmap->lock); + + return ret; +} + +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, uint32_t num, uint32_t mask) +{ + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = bitmap->avail = num; + bitmap->mask = mask; + bitmap->avail = bitmap->max; + mlx4_spinlock_init(&bitmap->lock, !mlx4_single_threaded); + bitmap->table = malloc(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t)); + + if (!bitmap->table) + return -ENOMEM; + memset((void *)bitmap->table, 0, + (int)(BITS_TO_WORDS(bitmap->max) * sizeof(uint32_t))); + return 0; +} + +inline void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) +{ + if (bitmap->table) + free(bitmap->table); +} + +static inline void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, uint32_t obj) +{ + mlx4_bitmap_free_range(bitmap, obj, 1); +} + +#endif Index: contrib/ofed/libmlx4/src/buf.c =================================================================== --- contrib/ofed/libmlx4/src/buf.c +++ contrib/ofed/libmlx4/src/buf.c @@ -36,9 +36,21 @@ #include #include +#include #include +#include +#include +#include #include "mlx4.h" +#include "bitmap.h" + +struct mlx4_hugetlb_mem { + int shmid; + char *shmaddr; + struct mlx4_bitmap bitmap; + struct list_head list; +}; #if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE)) @@ -59,13 +71,154 @@ #endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */ +void mlx4_hugetlb_mem_free(struct mlx4_hugetlb_mem *hmem) +{ + mlx4_bitmap_cleanup(&hmem->bitmap); + + if (shmdt((const void *)hmem->shmaddr) != 0) { + if (mlx4_trace) + perror("Detach shm failure"); + } + free(hmem); +} +static void mlx4_free_buf_huge_ex(struct mlx4_context *mctx, + struct mlx4_buf *buf, + int do_fork) +{ + struct mlx4_hugetlb_mem *hmem; + + if (do_fork) + ibv_dofork_range(buf->buf, buf->length); + + if (buf->hmem == NULL) { + if (mlx4_trace) + perror("No hugetlb mem"); + return; + } + + hmem = (struct mlx4_hugetlb_mem *) buf->hmem; + mlx4_spin_lock(&mctx->hugetlb_lock); + mlx4_bitmap_free_range(&hmem->bitmap, buf->base, + buf->length/MLX4_Q_CHUNK_SIZE); + + if (is_bitmap_empty(&hmem->bitmap)) { + list_del(&hmem->list); + mlx4_hugetlb_mem_free(hmem); + } + mlx4_spin_unlock(&mctx->hugetlb_lock); +} + +void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf) +{ + mlx4_free_buf_huge_ex(mctx, buf, 1); +} + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 0 +#endif + +struct mlx4_hugetlb_mem *mxl4_hugetlb_mem_alloc(size_t size) +{ + struct mlx4_hugetlb_mem *hmem; + size_t shm_len; + + hmem = malloc(sizeof(*hmem)); + if (!hmem) + return NULL; + + shm_len = (size > MLX4_SHM_LENGTH) ? 
align(size, MLX4_SHM_LENGTH) : + MLX4_SHM_LENGTH; + hmem->shmid = shmget(IPC_PRIVATE, shm_len, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (hmem->shmid < 0) { + if (mlx4_trace) + perror("shmget"); + free(hmem); + return NULL; + } + + hmem->shmaddr = shmat(hmem->shmid, MLX4_SHM_ADDR, MLX4_SHMAT_FLAGS); + if (hmem->shmaddr == (char *)-1) { + if (mlx4_trace) + perror("Shared memory attach failure"); + shmctl(hmem->shmid, IPC_RMID, NULL); + free(hmem); + return NULL; + } + + if (mlx4_bitmap_init(&hmem->bitmap, shm_len/MLX4_Q_CHUNK_SIZE, + shm_len/MLX4_Q_CHUNK_SIZE - 1)) { + if (mlx4_trace) + perror("mlx4_bitmap_init"); + mlx4_hugetlb_mem_free(hmem); + return NULL; + } + + /* Marked to destroy when process detaches from shmget segment */ + shmctl(hmem->shmid, IPC_RMID, NULL); + + return hmem; +} + + +int mlx4_alloc_prefered_buf(struct mlx4_context *mctx, + struct mlx4_buf *buf, + size_t size, int page_size, + enum mlx4_alloc_type alloc_type, + const char *component) +{ + int ret = 1; + + buf->hmem = NULL; + /* Fallback mechanism is used below: + priority is: huge pages , contig pages, default allocation */ + if (alloc_type == MLX4_ALLOC_TYPE_HUGE || + alloc_type == MLX4_ALLOC_TYPE_PREFER_HUGE || + alloc_type == MLX4_ALLOC_TYPE_ALL) { + ret = mlx4_alloc_buf_huge(mctx, buf, + size, + page_size); + if (!ret) + return 0; + + /* Checking whether HUGE is forced */ + if (alloc_type == MLX4_ALLOC_TYPE_HUGE) + return -1; + if (mlx4_trace) + printf(PFX "Huge mode allocation has failed,fallback to %s mode\n", + MLX4_ALLOC_TYPE_ALL ? "contig" : "default"); + + } + + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG || + alloc_type == MLX4_ALLOC_TYPE_PREFER_CONTIG || + alloc_type == MLX4_ALLOC_TYPE_ALL) { + ret = mlx4_alloc_buf_contig(mctx, buf, + size, + page_size, + component, NULL); + if (!ret) + return 0; + + /* Checking whether CONTIG is forced */ + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG) + return -1; + if (mlx4_trace) + printf(PFX "Contig mode allocation has failed,fallback to default mode\n"); + } + + return mlx4_alloc_buf(buf, size, page_size); + +} + + int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) { int ret; buf->length = align(size, page_size); buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf->buf == MAP_FAILED) return errno; @@ -78,6 +231,271 @@ void mlx4_free_buf(struct mlx4_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - munmap(buf->buf, buf->length); + if (buf->length) { + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); + } +} + +/* This function computes log2(v) rounded up. +* We don't want to have a dependency to libm which exposes ceil & log2 APIs. +* Code was written based on public domain code: + URL: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog. 
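mxl4_hugetlb_mem_alloc() above follows the classic SysV shared-memory recipe for huge pages: shmget() with SHM_HUGETLB, shmat(), then an immediate IPC_RMID so the segment disappears once the last user detaches. A standalone sketch of just that recipe (sizes and error handling are illustrative; running it requires huge pages configured on the host):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 0                    /* same fallback the patch uses */
#endif

int main(void)
{
    size_t len = 2UL * 1024 * 1024;                   /* one 2 MB huge page */
    int id = shmget(IPC_PRIVATE, len,
                    SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
    if (id < 0) {
        perror("shmget");                 /* e.g. no hugepages configured */
        return 1;
    }

    void *addr = shmat(id, NULL, 0);
    shmctl(id, IPC_RMID, NULL);          /* segment destroyed on last detach */
    if (addr == (void *)-1) {
        perror("shmat");
        return 1;
    }

    /* ...queue buffers would be carved out of addr via the bitmap above... */
    shmdt(addr);
    return 0;
}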
+*/ +static uint32_t mlx4_get_block_order(uint32_t v) +{ + static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + static const uint32_t shift_arr[] = {1, 2, 4, 8, 16}; + int i; + uint32_t input_val = v; + + register uint32_t r = 0;/* result of log2(v) will go here */ + for (i = 4; i >= 0; i--) { + + if (v & bits_arr[i]) { + v >>= shift_arr[i]; + r |= shift_arr[i]; + } + } + /* Rounding up if required */ + r += !!(input_val & ((1 << r) - 1)); + + return r; +} + + +static int mlx4_finalize_contiguous_alloc(struct mlx4_buf *buf, + void *addr, + size_t length) +{ + if (ibv_dontfork_range(addr, length)) { + munmap(addr, length); + return 1; + } + + /* We hook addr & length also internally for further + use via dreg_mr. On ibv_mr returned to user length or address may + be different than the allocated length or address as of alignment + issues. + */ + buf->buf = addr; + buf->length = length; + return 0; + +} + + +void mlx4_get_alloc_type(struct ibv_context *context, const char *component, + enum mlx4_alloc_type *alloc_type, + enum mlx4_alloc_type default_alloc_type) + +{ + char env_value[VERBS_MAX_ENV_VAL]; + char name_buff[128]; + + sprintf(name_buff, "%s_ALLOC_TYPE", component); + + /* First set defaults */ + *alloc_type = default_alloc_type; + + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + if (!strcasecmp(env_value, "ANON")) + *alloc_type = MLX4_ALLOC_TYPE_ANON; + else if (!strcasecmp(env_value, "HUGE")) + *alloc_type = MLX4_ALLOC_TYPE_HUGE; + else if (!strcasecmp(env_value, "CONTIG")) + *alloc_type = MLX4_ALLOC_TYPE_CONTIG; + else if (!strcasecmp(env_value, "PREFER_CONTIG")) + *alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + else if (!strcasecmp(env_value, "PREFER_HUGE")) + *alloc_type = MLX4_ALLOC_TYPE_PREFER_HUGE; + else if (!strcasecmp(env_value, "ALL")) + *alloc_type = MLX4_ALLOC_TYPE_ALL; + } + + return; +} + + +static void mlx4_alloc_get_env_info(struct ibv_context *context, + int *max_log2_contig_block_size, + int *min_log2_contig_block_size, + const char *component) + +{ + char env_value[VERBS_MAX_ENV_VAL]; + int value; + char name_buff[128]; + + /* First set defaults */ + *max_log2_contig_block_size = MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE; + *min_log2_contig_block_size = MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE; + + sprintf(name_buff, "%s_MAX_LOG2_CONTIG_BSIZE", component); + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + value = atoi(env_value); + if (value <= MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE && + value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE) + *max_log2_contig_block_size = value; + else + fprintf(stderr, + "Invalid value %d for %s\n", + value, name_buff); + } + sprintf(name_buff, "%s_MIN_LOG2_CONTIG_BSIZE", component); + if (!ibv_exp_cmd_getenv(context, name_buff, env_value, sizeof(env_value))) { + value = atoi(env_value); + if (value >= MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE && + value <= *max_log2_contig_block_size) + *min_log2_contig_block_size = value; + else + fprintf(stderr, + "Invalid value %d for %s\n", + value, name_buff); + } + return; } + + + +int mlx4_alloc_buf_contig(struct mlx4_context *mctx, + struct mlx4_buf *buf, size_t size, + int page_size, + const char *component, void *req_addr) +{ + void *addr = NULL; + int block_size_exp; + int max_log2_contig_block_size; + int min_log2_contig_block_size; + int mmap_flags = MAP_SHARED; + void *act_addr = NULL; + size_t act_size = size; + + struct ibv_context *context = &(mctx->ibv_ctx); + + mlx4_alloc_get_env_info(&mctx->ibv_ctx, + &max_log2_contig_block_size, + 
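mlx4_get_block_order() above is the Stanford bit-hacks integer log2, plus a round-up term for sizes that are not a power of two. A small self-contained restatement with two sanity checks (the asserted values are ordinary arithmetic, not driver output):

#include <assert.h>
#include <stdint.h>

static uint32_t log2_roundup(uint32_t v)      /* same idea as mlx4_get_block_order() */
{
    uint32_t in = v, r = 0;
    int s;

    for (s = 16; s >= 1; s >>= 1) {
        if (v >> s) {
            v >>= s;
            r |= (uint32_t)s;
        }
    }
    r += !!(in & ((1u << r) - 1));        /* round up when v was not a power of 2 */
    return r;
}

int main(void)
{
    assert(log2_roundup(4096) == 12);     /* exact power of two */
    assert(log2_roundup(4097) == 13);     /* anything larger rounds upward */
    return 0;
}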
&min_log2_contig_block_size, + component); + + /* Checking that we don't pass max block size */ + if (size >= (1 << max_log2_contig_block_size)) + block_size_exp = max_log2_contig_block_size; + else + block_size_exp = mlx4_get_block_order(size); + + if (req_addr) { + act_addr = (void *)((uintptr_t)req_addr & ~((uintptr_t)page_size - 1)); + act_size += (size_t)((uintptr_t)req_addr - (uintptr_t)act_addr); + mmap_flags |= MAP_FIXED; + } + + do { + /* The second parameter holds the total required length for + this contiguous allocation aligned to page size. + When calling mmap the last offset parameter + should be a multiple of the page size and holds: + 1) Indication that we are in that mode of + allocation contiguous memory (value #2) + 2) The required size of each block. + To enable future actions on mmap we + use the last 3 bits of the offset parameter + as the command identifier. + */ + addr = mmap(act_addr, act_size, + PROT_WRITE | PROT_READ, mmap_flags, + context->cmd_fd, + page_size * + (MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD + + (block_size_exp << MLX4_MMAP_CMD_BITS))); + + /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/ + if (addr != MAP_FAILED) + break; + + /* We failed - set addr to NULL and checks whether + a retry is relevant. + * If kernel doesn't support this command as of + compatibility issues we'll also get EINVAL. + */ + addr = NULL; + if (errno == EINVAL) + break; + + /* Retring asking for less contiguous pages per block */ + block_size_exp -= 1; + } while (block_size_exp >= min_log2_contig_block_size); + + if (!addr) + return 1; + + /* All was ok we'll make final steps to have this addr ready*/ + return mlx4_finalize_contiguous_alloc(buf, addr, act_size); +} + +int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size) +{ + struct mlx4_hugetlb_mem *hmem, *tmp_hmem; + int found = 0; + int ret = 0; + LIST_HEAD(slist); + + buf->length = align(size, MLX4_Q_CHUNK_SIZE); + + mlx4_spin_lock(&mctx->hugetlb_lock); + list_for_each_entry_safe(hmem, tmp_hmem, &mctx->hugetlb_list, list) { + if (is_bitmap_avail(&hmem->bitmap)) { + buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap, + buf->length/MLX4_Q_CHUNK_SIZE, 1); + if (buf->base == -1) + continue; + else { + buf->hmem = (void *)hmem; + found = 1; + break; + } + } + } + mlx4_spin_unlock(&mctx->hugetlb_lock); + + if (!found) { + int avail; + + hmem = mxl4_hugetlb_mem_alloc(buf->length); + if (hmem == NULL) + return -1; + + buf->base = mlx4_bitmap_alloc_range(&hmem->bitmap, + buf->length/MLX4_Q_CHUNK_SIZE, 1); + if (buf->base == -1) { + if (mlx4_trace) + perror("mlx4_bitmap_alloc_range"); + mlx4_hugetlb_mem_free(hmem); + return -1; + } + + buf->hmem = (void *)hmem; + + avail = is_bitmap_avail(&hmem->bitmap); + mlx4_spin_lock(&mctx->hugetlb_lock); + if (avail) + list_add(&hmem->list, &mctx->hugetlb_list); + else + list_add_tail(&hmem->list, &mctx->hugetlb_list); + mlx4_spin_unlock(&mctx->hugetlb_lock); + } + + buf->buf = hmem->shmaddr + (buf->base * MLX4_Q_CHUNK_SIZE); + + ret = ibv_dontfork_range(buf->buf, buf->length); + if (ret) { + mlx4_free_buf_huge_ex(mctx, buf, 0); + buf->hmem = NULL; + if (mlx4_trace) + perror("ibv_dontfork_range"); + } + + return ret; +} + Index: contrib/ofed/libmlx4/src/cq.c =================================================================== --- contrib/ofed/libmlx4/src/cq.c +++ contrib/ofed/libmlx4/src/cq.c @@ -47,6 +47,8 @@ #include "mlx4.h" #include "doorbell.h" +int mlx4_stall_num_loop = 300; + enum { MLX4_CQ_DOORBELL = 0x20 }; @@ -61,8 +63,18 @@ 
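The mmap() call in mlx4_alloc_buf_contig() above passes a command id and the per-block size exponent to the kernel through the page-aligned offset argument. A toy illustration of that packing (the CMD_BITS and command values below are placeholders for this example, not the real mlx4.h constants):

#include <stdio.h>
#include <sys/types.h>

#define DEMO_MMAP_CMD_BITS              3   /* placeholder: low bits carry the command id */
#define DEMO_GET_CONTIGUOUS_PAGES_CMD   2   /* placeholder command number */

int main(void)
{
    long page_size = 4096;
    int  block_size_exp = 21;                         /* request 2 MB blocks */

    /* multiplying by page_size keeps the offset page aligned, as mmap requires */
    off_t offset = (off_t)page_size *
        (DEMO_GET_CONTIGUOUS_PAGES_CMD + (block_size_exp << DEMO_MMAP_CMD_BITS));

    printf("offset passed to mmap: %lld\n", (long long)offset);
    return 0;
}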
#define MLX4_CQ_DB_REQ_NOT (2 << 24) enum { + MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, + MLX4_CQE_L2_TUNNEL_L4_CSUM = 1 << 26, + MLX4_CQE_L2_TUNNEL = 1 << 27, + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { MLX4_CQE_OWNER_MASK = 0x80, MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_INL_SCATTER_MASK = 0x20, MLX4_CQE_OPCODE_MASK = 0x1f }; @@ -82,23 +94,50 @@ MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, }; +enum { + MLX4_CQE_STATUS_L4_CSUM = 1 << 2, + MLX4_CQE_STATUS_IPV4 = 1 << 6, + MLX4_CQE_STATUS_IPV4F = 1 << 7, + MLX4_CQE_STATUS_IPV6 = 1 << 8, + MLX4_CQE_STATUS_IPV4OPT = 1 << 9, + MLX4_CQE_STATUS_TCP = 1 << 10, + MLX4_CQE_STATUS_UDP = 1 << 11, + MLX4_CQE_STATUS_IPOK = 1 << 12 +}; + + struct mlx4_cqe { - uint32_t my_qpn; + uint32_t vlan_my_qpn; uint32_t immed_rss_invalid; uint32_t g_mlpath_rqpn; - uint8_t sl; - uint8_t reserved1; - uint16_t rlid; - uint32_t reserved2; + union { + struct { + union { + struct { + uint16_t sl_vid; + uint16_t rlid; + }; + uint32_t timestamp_16_47; + }; + uint16_t status; + uint8_t reserved2; + uint8_t badfcs_enc; + }; + struct { + uint16_t reserved4; + uint8_t smac[6]; + }; + }; uint32_t byte_cnt; uint16_t wqe_index; uint16_t checksum; - uint8_t reserved3[3]; + uint8_t reserved5[1]; + uint16_t timestamp_0_15; uint8_t owner_sr_opcode; -}; +} __attribute__((packed)); struct mlx4_err_cqe { - uint32_t my_qpn; + uint32_t vlan_my_qpn; uint32_t reserved1[5]; uint16_t wqe_index; uint8_t vendor_err; @@ -118,7 +157,7 @@ struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ - !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : tcqe; + !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe; } static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) @@ -126,18 +165,13 @@ return get_sw_cqe(cq, cq->cons_index); } -static void update_cons_index(struct mlx4_cq *cq) -{ - *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); -} - static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc) { if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) printf(PFX "local QP operation err " "(QPN %06x, WQE index %x, vendor syndrome %02x, " "opcode = %02x)\n", - htonl(cqe->my_qpn), htonl(cqe->wqe_index), + htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index), cqe->vendor_err, cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); @@ -191,22 +225,34 @@ static int mlx4_poll_one(struct mlx4_cq *cq, struct mlx4_qp **cur_qp, - struct ibv_wc *wc) + struct ibv_exp_wc *wc, + uint32_t wc_size, int is_exp) { struct mlx4_wq *wq; struct mlx4_cqe *cqe; - struct mlx4_srq *srq = NULL; + struct mlx4_srq *srq; uint32_t qpn; - uint32_t srqn; uint32_t g_mlpath_rqpn; uint16_t wqe_index; int is_error; int is_send; - + int size; + int left; + int list_len; + int i; + struct mlx4_inlr_rbuff *rbuffs; + uint8_t *sbuff; + int timestamp_en = !!(cq->creation_flags & + IBV_EXP_CQ_TIMESTAMP); + uint64_t exp_wc_flags = 0; + uint64_t wc_flags = 0; cqe = next_cqe_sw(cq); if (!cqe) return CQ_EMPTY; + if (cq->cqe_size == 64) + ++cqe; + ++cq->cons_index; VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); @@ -217,36 +263,44 @@ */ rmb(); - qpn = ntohl(cqe->my_qpn); + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + wc->qp_num = qpn; is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + + /* include checksum as work around for calc opcode */ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == - MLX4_CQE_OPCODE_ERROR; + MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff); - if (qpn & MLX4_XRC_QPN_BIT && !is_send) { - srqn 
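The get_sw_cqe() hunk above keeps the usual ownership test: a CQE belongs to software when its owner bit matches the parity of how many times the consumer index has wrapped the CQ. A toy model of that check (CQ size and indices are made up):

#include <assert.h>
#include <stdint.h>

#define OWNER_MASK 0x80

/* cqe_cnt must be a power of two; cq->ibv_cq.cqe holds cqe_cnt - 1 */
static int cqe_is_sw(uint8_t owner_sr_opcode, uint32_t cons_index, uint32_t cqe_cnt)
{
    return !(!!(owner_sr_opcode & OWNER_MASK) ^ !!(cons_index & cqe_cnt));
}

int main(void)
{
    /* first pass through a 256-entry CQ: a clear owner bit means "software's" */
    assert(cqe_is_sw(0x00, 5, 256));
    /* after one wrap the same clear bit means the CQE still belongs to hardware */
    assert(!cqe_is_sw(0x00, 256 + 5, 256));
    return 0;
}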
= ntohl(cqe->g_mlpath_rqpn) & 0xffffff; + if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) { /* - * We do not have to take the XRC SRQ table lock here, - * because CQs will be locked while XRC SRQs are removed + * We do not have to take the XSRQ table lock here, + * because CQs will be locked while SRQs are removed * from the table. */ - srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn); + *cur_qp = NULL; + srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table, + ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); if (!srq) return CQ_POLL_ERR; - } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) { - /* - * We do not have to take the QP table lock here, - * because CQs will be locked while QPs are removed - * from the table. - */ - *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), - qpn & 0xffffff); - if (!*cur_qp) - return CQ_POLL_ERR; + } else { + if (unlikely(!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num))) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!*cur_qp)) + return CQ_POLL_ERR; + } + if (is_exp) { + wc->qp = &((*cur_qp)->verbs_qp.qp); + exp_wc_flags |= IBV_EXP_WC_QP; + } + srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL; } - wc->qp_num = qpn & 0xffffff; - if (is_send) { wq = &(*cur_qp)->sq; wqe_index = ntohs(cqe->wqe_index); @@ -257,112 +311,267 @@ wqe_index = htons(cqe->wqe_index); wc->wr_id = srq->wrid[wqe_index]; mlx4_free_srq_wqe(srq, wqe_index); - } else if ((*cur_qp)->ibv_qp.srq) { - srq = to_msrq((*cur_qp)->ibv_qp.srq); - wqe_index = htons(cqe->wqe_index); - wc->wr_id = srq->wrid[wqe_index]; - mlx4_free_srq_wqe(srq, wqe_index); + if (is_exp) { + wc->srq = &(srq->verbs_srq.srq); + exp_wc_flags |= IBV_EXP_WC_SRQ; + } } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wqe_index = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[wqe_index]; ++wq->tail; } - if (is_error) { - mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + if (unlikely(is_error)) { + mlx4_handle_error_cqe((struct mlx4_err_cqe *)cqe, + (struct ibv_wc *)wc); return CQ_OK; } wc->status = IBV_WC_SUCCESS; + if (timestamp_en && offsetof(struct ibv_exp_wc, timestamp) < wc_size) { + /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is + * supported. 
CQ_CREATE_WITH_TIMESTAMPING_SYS isn't + * supported */ + if (cq->creation_flags & + IBV_EXP_CQ_TIMESTAMP_TO_SYS_TIME) + wc->timestamp = 0; + else { + wc->timestamp = + (uint64_t)(ntohl(cqe->timestamp_16_47) + + !cqe->timestamp_0_15) << 16 + | (uint64_t)ntohs(cqe->timestamp_0_15); + exp_wc_flags |= IBV_EXP_WC_WITH_TIMESTAMP; + } + } + if (is_send) { - wc->wc_flags = 0; switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_CALC_RDMA_WRITE_IMM: case MLX4_OPCODE_RDMA_WRITE_IMM: - wc->wc_flags |= IBV_WC_WITH_IMM; + wc_flags |= IBV_WC_WITH_IMM; case MLX4_OPCODE_RDMA_WRITE: - wc->opcode = IBV_WC_RDMA_WRITE; + wc->exp_opcode = IBV_EXP_WC_RDMA_WRITE; break; case MLX4_OPCODE_SEND_IMM: - wc->wc_flags |= IBV_WC_WITH_IMM; + wc_flags |= IBV_WC_WITH_IMM; case MLX4_OPCODE_SEND: - wc->opcode = IBV_WC_SEND; + wc->exp_opcode = IBV_EXP_WC_SEND; break; case MLX4_OPCODE_RDMA_READ: - wc->opcode = IBV_WC_RDMA_READ; + wc->exp_opcode = IBV_EXP_WC_RDMA_READ; wc->byte_len = ntohl(cqe->byte_cnt); break; case MLX4_OPCODE_ATOMIC_CS: - wc->opcode = IBV_WC_COMP_SWAP; + wc->exp_opcode = IBV_EXP_WC_COMP_SWAP; wc->byte_len = 8; break; case MLX4_OPCODE_ATOMIC_FA: - wc->opcode = IBV_WC_FETCH_ADD; + wc->exp_opcode = IBV_EXP_WC_FETCH_ADD; wc->byte_len = 8; break; + case MLX4_OPCODE_ATOMIC_MASK_CS: + wc->exp_opcode = IBV_EXP_WC_MASKED_COMP_SWAP; + break; + case MLX4_OPCODE_ATOMIC_MASK_FA: + wc->exp_opcode = IBV_EXP_WC_MASKED_FETCH_ADD; + break; + case MLX4_OPCODE_LOCAL_INVAL: + if (unlikely(!is_exp)) + return CQ_POLL_ERR; + wc->exp_opcode = IBV_EXP_WC_LOCAL_INV; + break; + case MLX4_OPCODE_SEND_INVAL: + wc->exp_opcode = IBV_EXP_WC_SEND; + break; case MLX4_OPCODE_BIND_MW: - wc->opcode = IBV_WC_BIND_MW; + wc->exp_opcode = IBV_EXP_WC_BIND_MW; break; default: /* assume it's a send completion */ - wc->opcode = IBV_WC_SEND; + wc->exp_opcode = IBV_EXP_WC_SEND; break; } } else { wc->byte_len = ntohl(cqe->byte_cnt); + if ((*cur_qp) && (*cur_qp)->max_inlr_sg && + (cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) { + rbuffs = (*cur_qp)->inlr_buff.buff[wqe_index].sg_list; + list_len = (*cur_qp)->inlr_buff.buff[wqe_index].list_len; + sbuff = mlx4_get_recv_wqe((*cur_qp), wqe_index); + left = wc->byte_len; + for (i = 0; (i < list_len) && left; i++) { + size = min(rbuffs->rlen, left); + memcpy(rbuffs->rbuff, sbuff, size); + left -= size; + rbuffs++; + sbuff += size; + } + if (left) { + wc->status = IBV_WC_LOC_LEN_ERR; + return CQ_OK; + } + } switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: - wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; - wc->wc_flags = IBV_WC_WITH_IMM; + wc->exp_opcode = IBV_EXP_WC_RECV_RDMA_WITH_IMM; + wc_flags = IBV_WC_WITH_IMM; wc->imm_data = cqe->immed_rss_invalid; break; + case MLX4_RECV_OPCODE_SEND_INVAL: + if (unlikely(!is_exp)) + return CQ_POLL_ERR; + wc->exp_opcode = IBV_EXP_WC_RECV; + exp_wc_flags |= IBV_EXP_WC_WITH_INV; + wc->imm_data = ntohl(cqe->immed_rss_invalid); + break; case MLX4_RECV_OPCODE_SEND: - wc->opcode = IBV_WC_RECV; - wc->wc_flags = 0; + wc->exp_opcode = IBV_EXP_WC_RECV; + wc_flags = 0; break; case MLX4_RECV_OPCODE_SEND_IMM: - wc->opcode = IBV_WC_RECV; - wc->wc_flags = IBV_WC_WITH_IMM; + wc->exp_opcode = IBV_EXP_WC_RECV; + wc_flags = IBV_WC_WITH_IMM; wc->imm_data = cqe->immed_rss_invalid; break; } - wc->slid = ntohs(cqe->rlid); - wc->sl = cqe->sl >> 4; + if (!timestamp_en) { + exp_wc_flags |= IBV_EXP_WC_WITH_SLID; + wc->slid = ntohs(cqe->rlid); + } g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; 
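The timestamp path above reassembles a 48-bit hardware clock from two CQE fields, bumping the upper half when the low 16 bits read back as zero (the hwclock-wraparound workaround mentioned in the changelog). A tiny host-byte-order sketch of that arithmetic (the asserted values are invented):

#include <assert.h>
#include <stdint.h>

/* rebuild bits 0..47 of the clock; the driver applies ntohl/ntohs first */
static uint64_t rebuild_timestamp(uint32_t high32, uint16_t low16)
{
    return ((uint64_t)(high32 + !low16) << 16) | (uint64_t)low16;
}

int main(void)
{
    assert(rebuild_timestamp(0x00123456, 0xABCD) == 0x123456ABCDULL);
    /* when the low 16 bits are zero, the upper part is bumped by one */
    assert(rebuild_timestamp(0x00123456, 0x0000) == 0x1234570000ULL);
    return 0;
}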
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; - wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; + wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f; + /* When working with xrc srqs, don't have qp to check link layer. + * Using IB SL, should consider Roce. (TBD) + */ + /* sl is invalid when timestamp is used */ + if (!timestamp_en) { + if ((*cur_qp) && (*cur_qp)->link_layer == + IBV_LINK_LAYER_ETHERNET) + wc->sl = ntohs(cqe->sl_vid) >> 13; + else + wc->sl = ntohs(cqe->sl_vid) >> 12; + exp_wc_flags |= IBV_EXP_WC_WITH_SL; + } + if (is_exp) { + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)) { + /* Only ConnectX-3 Pro reports checksum for now) */ + exp_wc_flags |= + MLX4_TRANSPOSE(cqe->badfcs_enc, + MLX4_CQE_STATUS_L4_CSUM, + (uint64_t)IBV_EXP_WC_RX_TCP_UDP_CSUM_OK) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPOK), + (uint64_t)IBV_EXP_WC_RX_IP_CSUM_OK) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPV4), + (uint64_t)IBV_EXP_WC_RX_IPV4_PACKET) | + mlx4_transpose_uint16_t(cqe->status, + htons(MLX4_CQE_STATUS_IPV6), + (uint64_t)IBV_EXP_WC_RX_IPV6_PACKET) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL), + (uint64_t)IBV_EXP_WC_RX_TUNNEL_PACKET) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_IPOK), + (uint64_t)IBV_EXP_WC_RX_OUTER_IP_CSUM_OK) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_L4_CSUM), + (uint64_t)IBV_EXP_WC_RX_OUTER_TCP_UDP_CSUM_OK) | + mlx4_transpose_uint32_t(cqe->vlan_my_qpn, + htonl(MLX4_CQE_L2_TUNNEL_IPV4), + (uint64_t)IBV_EXP_WC_RX_OUTER_IPV4_PACKET); + exp_wc_flags |= + MLX4_TRANSPOSE(~exp_wc_flags, + IBV_EXP_WC_RX_OUTER_IPV4_PACKET, + IBV_EXP_WC_RX_OUTER_IPV6_PACKET); + } + } } + if (is_exp) + wc->exp_wc_flags = exp_wc_flags | (uint64_t)wc_flags; + + ((struct ibv_wc *)wc)->wc_flags = wc_flags; + return CQ_OK; } -int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +#if defined(__amd64__) || defined(__i386__) +static inline unsigned long get_cycles() +{ + unsigned low, high; + unsigned long long val; + asm volatile ("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val = (val << 32) | low; + return val; +} +#else +static inline unsigned long get_cycles() +{ + return 0; +} +#endif + +static void mlx4_stall_poll_cq() +{ + int i; + + for (i = 0; i < mlx4_stall_num_loop; i++) + (void)get_cycles(); +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_exp_wc *wc, + uint32_t wc_size, int is_exp) { struct mlx4_cq *cq = to_mcq(ibcq); struct mlx4_qp *qp = NULL; int npolled; int err = CQ_OK; - pthread_spin_lock(&cq->lock); - + if (unlikely(cq->stall_next_poll)) { + cq->stall_next_poll = 0; + mlx4_stall_poll_cq(); + } + mlx4_lock(&cq->lock); + for (npolled = 0; npolled < ne; ++npolled) { - err = mlx4_poll_one(cq, &qp, wc + npolled); - if (err != CQ_OK) + err = mlx4_poll_one(cq, &qp, ((void *)wc) + npolled * wc_size, + wc_size, is_exp); + if (unlikely(err != CQ_OK)) break; } - if (npolled) - update_cons_index(cq); + if (likely(npolled || err == CQ_POLL_ERR)) + mlx4_update_cons_index(cq); - pthread_spin_unlock(&cq->lock); + mlx4_unlock(&cq->lock); + if (unlikely(cq->stall_enable && err == CQ_EMPTY)) + cq->stall_next_poll = 1; + return err == CQ_POLL_ERR ? 
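The exp_wc_flags block above relocates individual hardware status bits into the flag positions libibverbs expects. A minimal stand-in for that bit transposition (the real MLX4_TRANSPOSE and mlx4_transpose_*() helpers live in mlx4.h and are not reproduced here; this function only mimics the idea):

#include <assert.h>
#include <stdint.h>

/* if the "from" bit is set in val, report the "to" bit, else nothing */
static uint64_t transpose_bit(uint64_t val, uint64_t from, uint64_t to)
{
    return (val & from) ? to : 0;
}

int main(void)
{
    uint32_t hw_status = 1u << 12;                  /* e.g. an "IP checksum OK" bit */
    uint64_t wc_flags  = transpose_bit(hw_status, 1u << 12, 1ull << 3);

    assert(wc_flags == (1ull << 3));
    assert(transpose_bit(0, 1u << 12, 1ull << 3) == 0);
    return 0;
}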
err : npolled; } +int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size) +{ + return mlx4_poll_cq(ibcq, num_entries, wc, wc_size, 1); +} + +int mlx4_poll_ibv_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + return mlx4_poll_cq(ibcq, ne, (struct ibv_exp_wc *)wc, sizeof(*wc), 0); +} + int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) { struct mlx4_cq *cq = to_mcq(ibvcq); @@ -402,12 +611,10 @@ uint32_t prod_index; uint8_t owner_bit; int nfreed = 0; - int is_xrc_srq = 0; int cqe_inc = cq->cqe_size == 64 ? 1 : 0; - if (srq && srq->ibv_srq.xrc_cq) - is_xrc_srq = 1; - + if (cq->last_qp && cq->last_qp->verbs_qp.qp.qp_num == qpn) + cq->last_qp = NULL; /* * First we need to find the current producer index, so we * know where to start cleaning from. It doesn't matter if HW @@ -426,12 +633,12 @@ while ((int) --prod_index - (int) cq->cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); cqe += cqe_inc; - if (is_xrc_srq && - (ntohl(cqe->g_mlpath_rqpn & 0xffffff) == srq->srqn) && + if (srq && srq->ext_srq && + ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) { mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); ++nfreed; - } else if ((ntohl(cqe->my_qpn) & 0xffffff) == qpn) { + } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); ++nfreed; @@ -452,22 +659,22 @@ * updating consumer index. */ wmb(); - update_cons_index(cq); + mlx4_update_cons_index(cq); } } void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) { - pthread_spin_lock(&cq->lock); + mlx4_lock(&cq->lock); __mlx4_cq_clean(cq, qpn, srq); - pthread_spin_unlock(&cq->lock); + mlx4_unlock(&cq->lock); } int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) { uint32_t i; - for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i) + for (i = cq->cons_index; get_sw_cqe(cq, i); ++i) ; return i - cq->cons_index; @@ -496,13 +703,491 @@ ++cq->cons_index; } -int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, +int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent, int entry_size) { - if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size), - dev->page_size)) + struct mlx4_device *dev = to_mdev(mctx->ibv_ctx.device); + int ret; + enum mlx4_alloc_type alloc_type; + enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + + if (mlx4_use_huge(&mctx->ibv_ctx, "HUGE_CQ")) + default_alloc_type = MLX4_ALLOC_TYPE_HUGE; + + mlx4_get_alloc_type(&mctx->ibv_ctx, MLX4_CQ_PREFIX, &alloc_type, + default_alloc_type); + + ret = mlx4_alloc_prefered_buf(mctx, buf, + align(nent * entry_size, dev->page_size), + dev->page_size, + alloc_type, + MLX4_CQ_PREFIX); + + if (ret) return -1; + memset(buf->buf, 0, nent * entry_size); return 0; } + +/* + * poll family functions + */ +static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) __attribute__((always_inline)); +static inline int drain_rx(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp, uint8_t *buf, uint32_t *inl) +{ + struct mlx4_srq *srq; + uint32_t qpn; + uint16_t wqe_index; + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + + + if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) { + if (unlikely(qpn & MLX4_XRC_QPN_BIT)) { + /* + * We do not have to take the XSRQ 
table lock here, + * because CQs will be locked while SRQs are removed + * from the table. + */ + cur_qp = NULL; + srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table, + ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); + if (!srq) + return CQ_POLL_ERR; + + /* Advance indexes only on success */ + wqe_index = htons(cqe->wqe_index); + mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index); + + ++cq->cons_index; + + return CQ_OK; + } + + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!cur_qp)) + return CQ_POLL_ERR; + cq->last_qp = cur_qp; + } + + if (!cur_qp->max_inlr_sg) { + /* Advance indexes only on success to enable getting + * the full CQE with ibv_poll_cq in case of failure + */ + if (unlikely(cur_qp->verbs_qp.qp.srq)) { + wqe_index = htons(cqe->wqe_index); + mlx4_free_srq_wqe(to_msrq(cur_qp->verbs_qp.qp.srq), wqe_index); + } else { + ++cur_qp->rq.tail; + } + ++cq->cons_index; + + return CQ_OK; + } + + /* We get here only when cur_qp->max_inlr_sg != 0 */ + if (likely(cqe->owner_sr_opcode & MLX4_CQE_INL_SCATTER_MASK)) { + int size; + int left; + int list_len; + int i; + struct mlx4_inlr_rbuff *rbuffs; + uint8_t *sbuff; + int is_error; + + /* include checksum as work around for calc opcode */ + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR && (cqe->checksum & 0xff); + if (unlikely(is_error)) + return CQ_POLL_ERR; + + wqe_index = cur_qp->rq.tail & (cur_qp->rq.wqe_cnt - 1); + sbuff = mlx4_get_recv_wqe(cur_qp, wqe_index); + left = ntohl(cqe->byte_cnt); + if (likely(buf)) { + *inl = 1; + memcpy(buf, sbuff, left); + } else { + rbuffs = cur_qp->inlr_buff.buff[wqe_index].sg_list; + list_len = cur_qp->inlr_buff.buff[wqe_index].list_len; + for (i = 0; (i < list_len) && left; i++) { + size = min(rbuffs->rlen, left); + memcpy(rbuffs->rbuff, sbuff, size); + left -= size; + rbuffs++; + sbuff += size; + } + if (left) + return CQ_POLL_ERR; + } + } + + /* Advance indexes only on success to enable getting + * the full CQE with ibv_poll_cq in case of failure + */ + ++cur_qp->rq.tail; + + ++cq->cons_index; + + return CQ_OK; +} + +static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp) __attribute__((always_inline)); +static inline int update_sq_tail(struct mlx4_cq *cq, struct mlx4_cqe *cqe, + struct mlx4_qp *cur_qp) +{ + uint32_t qpn; + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + if (unlikely(!cur_qp || (qpn != cur_qp->verbs_qp.qp.qp_num))) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
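drain_rx() above copies an inline-received payload out of the WQE buffer into the application's scatter list, stopping when either side runs out. A standalone sketch of that copy loop (buffer names and sizes are invented for the example):

#include <assert.h>
#include <string.h>
#include <stdint.h>

struct rbuf { void *addr; size_t len; };

/* spread one contiguous payload across a scatter list; returns bytes left over */
static size_t scatter_copy(const uint8_t *src, size_t len,
                           struct rbuf *sg, int sg_num)
{
    size_t left = len;
    for (int i = 0; i < sg_num && left; i++) {
        size_t chunk = sg[i].len < left ? sg[i].len : left;
        memcpy(sg[i].addr, src, chunk);
        src += chunk;
        left -= chunk;
    }
    return left;                 /* non-zero means the scatter list was too small */
}

int main(void)
{
    uint8_t payload[10] = "abcdefghi";
    uint8_t a[4], b[8];
    struct rbuf sg[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

    assert(scatter_copy(payload, sizeof(payload), sg, 2) == 0);
    assert(memcmp(a, "abcd", 4) == 0 && memcmp(b, "efghi", 6) == 0);
    return 0;
}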
+ */ + cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (unlikely(!cur_qp)) + return CQ_POLL_ERR; + cq->last_qp = cur_qp; + } + + /* Advance indexes only on success */ + cur_qp->sq.tail += (uint16_t)(ntohs(cqe->wqe_index) - (uint16_t)cur_qp->sq.tail); + ++cq->cons_index; + + return CQ_OK; +} + +static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) __attribute__((always_inline)); +static inline struct mlx4_cqe *get_next_cqe(struct mlx4_cq *cq, int const cqe_size) +{ + int cqe_off = (cqe_size & 64) >> 1; /* CQE offset is 32 bytes in case cqe_size == 64 */ + struct mlx4_cqe *cqe = cq->buf.buf + (cq->cons_index & cq->ibv_cq.cqe) * cqe_size + cqe_off; + + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & (cq->ibv_cq.cqe + 1))) + return NULL; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + return cqe; +} + +static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) __attribute__((always_inline)); +static inline int32_t poll_cnt(struct ibv_cq *ibcq, uint32_t max_entries, const int use_lock, const int cqe_size) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_cqe *cqe; + int npolled; + int err = CQ_OK; + + if (unlikely(use_lock)) + mlx4_lock(&cq->lock); + + for (npolled = 0; npolled < max_entries; ++npolled) { + cqe = get_next_cqe(cq, cqe_size); + if (!cqe) { + err = CQ_EMPTY; + break; + } + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + if (likely(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + err = update_sq_tail(cq, cqe, cq->last_qp); + else + err = drain_rx(cq, cqe, cq->last_qp, NULL, NULL); + + if (unlikely(err != CQ_OK)) + break; + } + + if (likely(npolled)) { + mlx4_update_cons_index(cq); + err = CQ_OK; + } + + if (unlikely(use_lock)) + mlx4_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? -1 : npolled; +} + +static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) __attribute__((always_inline)); +static inline int32_t get_flags(struct mlx4_qp *cur_qp, struct mlx4_cqe *cqe) +{ + /* Only ConnectX-3 Pro reports checksum for now) */ + if (likely(cur_qp && (cur_qp->qp_cap_cache & + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP))) { + int32_t flags; + int32_t tmp; + + /* + * The relevant bits are in different locations on their + * CQE fields therefore we can join them in one 32bit + * variable. 
+ */ + tmp = (cqe->badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) | + (ntohs(cqe->status) & (MLX4_CQE_STATUS_IPOK | + MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV6)) | + (ntohl(cqe->vlan_my_qpn) & (MLX4_CQE_L2_TUNNEL | + MLX4_CQE_L2_TUNNEL_IPOK | + MLX4_CQE_L2_TUNNEL_L4_CSUM | + MLX4_CQE_L2_TUNNEL_IPV4)); + if (likely(tmp == cur_qp->cached_rx_csum_flags)) { + flags = cur_qp->transposed_rx_csum_flags; + } else { + flags = mlx4_transpose(tmp, MLX4_CQE_STATUS_IPOK, IBV_EXP_CQ_RX_IP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_L4_CSUM, IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV4, IBV_EXP_CQ_RX_IPV4_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_STATUS_IPV6, IBV_EXP_CQ_RX_IPV6_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL, IBV_EXP_CQ_RX_TUNNEL_PACKET) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPOK, IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_L4_CSUM, IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK) | + mlx4_transpose(tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET) | + mlx4_transpose(~tmp, MLX4_CQE_L2_TUNNEL_IPV4, IBV_EXP_CQ_RX_OUTER_IPV6_PACKET); + cur_qp->cached_rx_csum_flags = tmp; + cur_qp->transposed_rx_csum_flags = flags; + } + + return flags; + } + + return 0; +} + +static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl, + const int use_lock, const int cqe_size, + uint32_t *flags) __attribute__((always_inline)); +static inline int32_t poll_length(struct ibv_cq *ibcq, void *buf, uint32_t *inl, + const int use_lock, const int cqe_size, + uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_cqe *cqe; + int32_t size = 0; + int err; + + if (unlikely(use_lock)) + mlx4_lock(&cq->lock); + + cqe = get_next_cqe(cq, cqe_size); + if (cqe) { + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + if (likely(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))) { + err = drain_rx(cq, cqe, cq->last_qp, buf, inl); + if (likely(err == CQ_OK)) { + size = ntohl(cqe->byte_cnt); + if (flags) + *flags = get_flags(cq->last_qp, cqe); + mlx4_update_cons_index(cq); + } + } else { + err = CQ_POLL_ERR; + } + + } else { + err = CQ_EMPTY; + } + + + if (unlikely(use_lock)) + mlx4_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? 
-1 : size; +} + +int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_safe(struct ibv_cq *ibcq, uint32_t max) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_cnt(ibcq, max, 1, cq->cqe_size); +} + +int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_other(struct ibv_cq *ibcq, uint32_t max) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_cnt(ibcq, max, 0, cq->cqe_size); +} + +int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe32(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 32); +} + +int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe64(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 64); +} + +int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_cnt_unsafe_cqe128(struct ibv_cq *ibcq, uint32_t max) +{ + return poll_cnt(ibcq, max, 0, 128); +} + +int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 1, cq->cqe_size, NULL); +} + +int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 0, cq->cqe_size, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 32, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 64, NULL); +} + +int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl) +{ + return poll_length(cq, buf, inl, 0, 128, NULL); +} + +int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_safe(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 1, cq->cqe_size, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_other(struct ibv_cq *ibcq, void *buf, uint32_t *inl, uint32_t *flags) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + + return poll_length(ibcq, buf, inl, 0, cq->cqe_size, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_cqe32(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 32, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; 
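/*
 * Editorial aside, not part of the patch: the block of one-line wrappers
 * here is a constant-propagation pattern.  poll_cnt()/poll_length() are
 * forced inline with 'use_lock' and 'cqe_size' passed as literal constants,
 * so every exported variant is compiled with the locking branches and the
 * CQE-size arithmetic folded away.  A generic sketch of the technique,
 * using hypothetical names:
 */
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static inline int sum_n(const int *data, int n, const int use_lock,
			const int stride) __attribute__((always_inline));
static inline int sum_n(const int *data, int n, const int use_lock,
			const int stride)
{
	int i, sum = 0;

	if (use_lock)			/* folds away when use_lock is 0 */
		pthread_mutex_lock(&demo_lock);
	for (i = 0; i < n; i++)		/* data must hold n * stride ints */
		sum += data[i * stride];
	if (use_lock)
		pthread_mutex_unlock(&demo_lock);

	return sum;
}

/* Thin exported specializations, analogous to the *_safe/*_unsafe_cqeNN set */
int sum_n_safe(const int *data, int n)		 { return sum_n(data, n, 1, 1); }
int sum_n_unsafe(const int *data, int n)	 { return sum_n(data, n, 0, 1); }
int sum_n_unsafe_stride4(const int *data, int n) { return sum_n(data, n, 0, 4); }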
+int32_t mlx4_poll_length_flags_unsafe_cqe64(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 64, flags); +} + +int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) __MLX4_ALGN_FUNC__; +int32_t mlx4_poll_length_flags_unsafe_cqe128(struct ibv_cq *cq, void *buf, uint32_t *inl, uint32_t *flags) +{ + return poll_length(cq, buf, inl, 0, 128, flags); +} + +static struct ibv_exp_cq_family mlx4_poll_cq_family_safe = { + .poll_cnt = mlx4_poll_cnt_safe, + .poll_length = mlx4_poll_length_safe, + .poll_length_flags = mlx4_poll_length_flags_safe +}; + +enum mlx4_poll_cq_cqe_sizes { + MLX4_POLL_CQ_CQE_32 = 0, + MLX4_POLL_CQ_CQE_64 = 1, + MLX4_POLL_CQ_CQE_128 = 2, + MLX4_POLL_CQ_CQE_OTHER = 3, + MLX4_POLL_CQ_NUM_CQE_SIZES = 4, +}; + +static struct ibv_exp_cq_family mlx4_poll_cq_family_unsafe_tbl[MLX4_POLL_CQ_NUM_CQE_SIZES] = { + [MLX4_POLL_CQ_CQE_32] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe32, + .poll_length = mlx4_poll_length_unsafe_cqe32, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe32 + }, + [MLX4_POLL_CQ_CQE_64] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe64, + .poll_length = mlx4_poll_length_unsafe_cqe64, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe64 + }, + [MLX4_POLL_CQ_CQE_128] = { + .poll_cnt = mlx4_poll_cnt_unsafe_cqe128, + .poll_length = mlx4_poll_length_unsafe_cqe128, + .poll_length_flags = mlx4_poll_length_flags_unsafe_cqe128 + }, + [MLX4_POLL_CQ_CQE_OTHER] = { + .poll_cnt = mlx4_poll_cnt_unsafe_other, + .poll_length = mlx4_poll_length_unsafe_other, + .poll_length_flags = mlx4_poll_length_flags_unsafe_other + }, +}; + +struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status) +{ + enum mlx4_poll_cq_cqe_sizes cqe_size = MLX4_POLL_CQ_CQE_OTHER; + + if (params->flags) { + fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for CQ family\n", params->flags); + *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED; + + return NULL; + } + if (params->family_flags) { + fprintf(stderr, PFX "Family flags(0x%x) are not supported for CQ family\n", params->family_flags); + *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED; + + return NULL; + } + + if (cq->model_flags & MLX4_CQ_MODEL_FLAG_THREAD_SAFE) + return &mlx4_poll_cq_family_safe; + + if (cq->cqe_size == 32) + cqe_size = MLX4_POLL_CQ_CQE_32; + else if (cq->cqe_size == 64) + cqe_size = MLX4_POLL_CQ_CQE_64; + else if (cq->cqe_size == 128) + cqe_size = MLX4_POLL_CQ_CQE_128; + + return &mlx4_poll_cq_family_unsafe_tbl[cqe_size]; +} Index: contrib/ofed/libmlx4/src/doorbell.h =================================================================== --- contrib/ofed/libmlx4/src/doorbell.h +++ contrib/ofed/libmlx4/src/doorbell.h @@ -33,7 +33,8 @@ #ifndef DOORBELL_H #define DOORBELL_H -#ifdef __LP64__ +#if __LP64__ + #if __BYTE_ORDER == __LITTLE_ENDIAN # define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) #elif __BYTE_ORDER == __BIG_ENDIAN @@ -51,10 +52,10 @@ static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) { - pthread_spin_lock(&ctx->uar_lock); + mlx4_spin_lock(&ctx->uar_lock); *(volatile uint32_t *) (ctx->uar + offset) = val[0]; *(volatile uint32_t *) (ctx->uar + offset + 4) = val[1]; - pthread_spin_unlock(&ctx->uar_lock); + mlx4_spin_unlock(&ctx->uar_lock); } #endif Index: contrib/ofed/libmlx4/src/list.h =================================================================== --- 
/dev/null +++ contrib/ofed/libmlx4/src/list.h @@ -0,0 +1,330 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is + * empty _and_ checks that no other CPU might be + * in the process of still modifying either member + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + * + * @head: the list to test. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member)*__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; prefetch(pos->next), pos != (head); \ + pos->next) + +/** + * __list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + * + * This variant differs from list_for_each() in that it's the + * simplest possible list iteration code, no prefetching is done. + * Use this for code that knows the list to be very short (empty + * or 1 entry) most of the time. + */ +#define __list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * list_prepare_entry - prepare a pos entry for use as a start point in + * list_for_each_entry_continue + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_struct within the struct. + */ +#define list_prepare_entry(pos, head, member) \ + ((pos) ? : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
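/*
 * Editorial aside, not part of the patch: a minimal usage sketch for the
 * list.h API added above - embed a struct list_head in your object, link
 * objects through it, and recover the container with list_entry().  Assumes
 * the header above is reachable as "list.h":
 */
#include <stdio.h>
#include <stdlib.h>
#include "list.h"

struct item {
	int			value;
	struct list_head	node;	/* linkage embedded in the object */
};

static LIST_HEAD(items);

static void list_demo(void)
{
	struct list_head *pos, *n;
	struct item *it;
	int i;

	for (i = 0; i < 3; i++) {
		it = malloc(sizeof(*it));
		it->value = i;
		list_add_tail(&it->node, &items);	/* FIFO order */
	}

	/* _safe variant: the current entry may be removed while iterating */
	list_for_each_safe(pos, n, &items) {
		it = list_entry(pos, struct item, node);
		printf("%d\n", it->value);
		list_del(pos);
		free(it);
	}
}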
+ */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#endif + Index: contrib/ofed/libmlx4/src/mlx4-abi.h =================================================================== --- contrib/ofed/libmlx4/src/mlx4-abi.h +++ contrib/ofed/libmlx4/src/mlx4-abi.h @@ -35,14 +35,22 @@ #include -#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MIN_ABI_VERSION 3 #define MLX4_UVERBS_MAX_ABI_VERSION 4 +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0, +#ifdef MLX4_WQE_FORMAT + MLX4_USER_DEV_CAP_WQE_FORMAT = 1L << 1 +#endif +}; + struct mlx4_alloc_ucontext_resp_v3 { struct ibv_get_context_resp ibv_resp; __u32 qp_tab_size; __u16 bf_reg_size; __u16 bf_regs_per_page; + __u32 cqe_size; }; struct mlx4_alloc_ucontext_resp { @@ -54,6 +62,14 @@ __u32 cqe_size; }; +struct mlx4_alloc_ucontext_req { + struct ibv_get_context cmd; +#ifdef MLX4_WQE_FORMAT + __u32 lib_caps; + __u32 reserved; +#endif +}; + struct mlx4_alloc_pd_resp { struct ibv_alloc_pd_resp ibv_resp; __u32 pdn; @@ -77,16 +93,14 @@ __u64 buf_addr; }; -#ifdef HAVE_IBV_XRC_OPS -struct mlx4_create_xrc_srq { - struct ibv_create_xrc_srq ibv_cmd; +struct mlx4_create_srq { + struct ibv_create_srq ibv_cmd; __u64 buf_addr; __u64 db_addr; }; -#endif -struct mlx4_create_srq { - struct ibv_create_srq ibv_cmd; +struct mlx4_create_xsrq { + struct ibv_create_xsrq ibv_cmd; __u64 buf_addr; __u64 db_addr; }; @@ -97,8 +111,7 @@ __u32 reserved; }; -struct mlx4_create_qp { - struct ibv_create_qp ibv_cmd; +struct mlx4_create_qp_base { __u64 buf_addr; __u64 db_addr; __u8 log_sq_bb_count; @@ -107,12 +120,14 @@ __u8 reserved[5]; }; -#ifdef HAVE_IBV_XRC_OPS -struct mlx4_open_xrc_domain_resp { - struct ibv_open_xrc_domain_resp ibv_resp; - __u32 xrcdn; - __u32 reserved; +struct mlx4_exp_create_qp_provider { + struct mlx4_create_qp_base base; + __u64 uar_virt_add; +}; + +struct mlx4_create_qp { + struct ibv_create_qp ibv_cmd; + struct mlx4_create_qp_base base; }; -#endif #endif /* MLX4_ABI_H */ Index: contrib/ofed/libmlx4/src/mlx4.h =================================================================== --- contrib/ofed/libmlx4/src/mlx4.h +++ contrib/ofed/libmlx4/src/mlx4.h @@ -34,10 +34,32 @@ #ifndef MLX4_H #define MLX4_H +#include #include +#include +#include #include +#include #include +#include +#include + +#define MLX4_MMAP_CMD_BITS 8 +#define MLX4_MMAP_GET_CONTIGUOUS_PAGES_CMD 2 +#define MLX4_IB_MMAP_GET_HW_CLOCK 3 + +/* Use EXP mmap commands until it is pushed to upstream */ +#define MLX4_IB_EXP_MMAP_EXT_UAR_PAGE 0xFE +#define MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE 0xFF + +#define MLX4_IB_MMAP_CMD_MASK 0xFF +#define MLX4_CQ_PREFIX "MLX_CQ" +#define MLX4_QP_PREFIX "MLX_QP" +#define MLX4_MR_PREFIX "MLX_MR" +#define MLX4_MAX_LOG2_CONTIG_BLOCK_SIZE 23 +#define MLX4_MIN_LOG2_CONTIG_BLOCK_SIZE 12 +#define MLX4_PORTS_NUM 2 #ifdef HAVE_VALGRIND_MEMCHECK_H @@ -69,7 +91,7 @@ #if defined(__i386__) #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") -#elif defined(__x86_64__) +#elif defined(__amd64__) #define wc_wmb() asm volatile("sfence" ::: "memory") #elif defined(__ia64__) #define wc_wmb() asm volatile("fwb" ::: "memory") @@ -79,29 +101,93 @@ #endif -#ifndef HAVE_IBV_MORE_OPS -#undef HAVE_IBV_XRC_OPS -#undef HAVE_IBV_CREATE_QP_EXP -#endif - #define HIDDEN __attribute__((visibility ("hidden"))) +#define MLX4_GCC_VERSION (__GNUC__ * 100 + 
__GNUC_MINOR__) + +#if MLX4_GCC_VERSION >= 403 +# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64))) +# define __MLX4_ALGN_DATA__ __attribute__((aligned(64))) +#else +# define __MLX4_ALGN_FUNC__ +# define __MLX4_ALGN_DATA__ +#endif + #define PFX "mlx4: " #ifndef max -#define max(a,b) \ +#define max(a, b) \ ({ typeof (a) _a = (a); \ typeof (b) _b = (b); \ _a > _b ? _a : _b; }) #endif #ifndef min -#define min(a,b) \ +#define min(a, b) \ ({ typeof (a) _a = (a); \ typeof (b) _b = (b); \ _a < _b ? _a : _b; }) #endif +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x),1) +#else +#define likely(x) (x) +#endif +#endif + + +#ifndef unlikely +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + +#ifndef uninitialized_var +#define uninitialized_var(x) x = x +#endif + +#include "list.h" + +/****************************************/ +/* ioctl codes */ +/****************************************/ +#define MLX4_IOC_MAGIC 'm' +#define MLX4_IOCHWCLOCKOFFSET _IOR(MLX4_IOC_MAGIC, 1, int) + +/* Generic macro to convert MLX4 to IBV flags. */ +#define MLX4_TRANSPOSE(val, from, to) \ + (((from) >= (to)) ? \ + (((val) & (from)) / ((from) / (to))) : \ + (((val) & (from)) * ((to) / (from)))) + +static inline uint64_t mlx4_transpose_uint16_t(uint16_t val, uint16_t from, uint64_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +static inline uint64_t mlx4_transpose_uint32_t(uint32_t val, uint32_t from, uint64_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +static inline uint32_t mlx4_transpose(uint32_t val, uint32_t from, uint32_t to) +{ + return MLX4_TRANSPOSE(val, from, to); +} + +enum { + MLX4_MAX_FAMILY_VER = 0 +}; + +enum { + MLX4_MAX_BFS_IN_PAGE = 8, + MLX4_BFS_STRIDE = 512, +}; + enum { MLX4_STAT_RATE_OFFSET = 5 }; @@ -112,14 +198,86 @@ MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 }; +#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl(wr->qp_type.xrc.remote_srqn << 8) + enum { - MLX4_XRC_SRQ_TABLE_BITS = 8, - MLX4_XRC_SRQ_TABLE_SIZE = 1 << MLX4_XRC_SRQ_TABLE_BITS, - MLX4_XRC_SRQ_TABLE_MASK = MLX4_XRC_SRQ_TABLE_SIZE - 1 + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 }; enum { - MLX4_XRC_QPN_BIT = (1 << 23) + MLX4_QP_PATTERN = 0x012389AB, + MLX4_CQ_PATTERN = 0x4567CDEF +}; + +enum mlx4_lock_type { + MLX4_SPIN_LOCK = 0, + MLX4_MUTEX = 1, +}; + +enum mlx4_lock_state { + MLX4_USE_LOCK, + MLX4_LOCKED, + MLX4_UNLOCKED +}; + +/* QP DoorBell ringing methods */ +enum mlx4_db_method { + MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB,/* QP has dedicated BF, */ + /* only one thread is using this QP, */ + /* the arch supports WC auto evict and */ + /* prefer_bf flag is set. */ + /* This means that there is no need for */ + /* wc_wmb to flush the WC buffer */ + MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB, /* Same as previous but prefer_bf */ + /* flag is not set */ + MLX4_QP_DB_METHOD_DEDIC_BF, /* QP has dedicated BF */ + MLX4_QP_DB_METHOD_BF, /* QP has BF which may be shared with other QPs */ + MLX4_QP_DB_METHOD_DB /* BF is not valid for this QP, use DoorBell to send the messages */ +}; + +enum mlx4_res_domain_bf_type { + MLX4_RES_DOMAIN_BF_NONE, /* No BF for this resource domain */ + MLX4_RES_DOMAIN_BF_SAFE, /* Use BF when possible */ + MLX4_RES_DOMAIN_BF_UNSAFE, /* Use BF when possible. */ + /* The application is responsible to sync between */ + /* calls to objects using this resource domain. 
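/*
 * Editorial aside, not part of the patch: MLX4_TRANSPOSE() moves a single
 * flag bit from its position in a hardware mask to its position in a verbs
 * mask by multiplying or dividing with the ratio of the two masks; this is
 * what lets get_flags() translate MLX4_CQE_STATUS_* bits into
 * IBV_EXP_CQ_RX_* bits without conditional branches.  Worked examples:
 */
#include <assert.h>

static void transpose_demo(void)
{
	/* move bit 6 (0x40) down to bit 2 (0x04): (0x40 & 0x40) / (0x40/0x04) */
	assert(MLX4_TRANSPOSE(0x40, 0x40, 0x04) == 0x04);
	/* move bit 0 (0x01) up to bit 7 (0x80): (0x01 & 0x01) * (0x80/0x01) */
	assert(MLX4_TRANSPOSE(0x01, 0x01, 0x80) == 0x80);
	/* source bit clear in val -> result is 0 */
	assert(MLX4_TRANSPOSE(0x00, 0x40, 0x04) == 0x00);
}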
*/ + /* This means that there is no need to use the BF */ + /* lock. */ + MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT, /* Use BF when possible. */ + /* Only one thread is using this resource */ + /* and the arch supports WC auto-evict. */ + /* This means that there is no need to use */ + /* wc_wmb function to flush the BF buffer */ + +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); + +enum { + MLX4_XRC_QPN_BIT = (1 << 23) +}; + +enum qp_cap_cache { + /* The flag below includes VXLAN support as well in mlx4 HW*/ + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1 }; enum mlx4_db_type { @@ -128,6 +286,15 @@ MLX4_NUM_DB_TYPE }; +enum mlx4_alloc_type { + MLX4_ALLOC_TYPE_ANON, + MLX4_ALLOC_TYPE_HUGE, + MLX4_ALLOC_TYPE_CONTIG, + MLX4_ALLOC_TYPE_PREFER_HUGE, + MLX4_ALLOC_TYPE_PREFER_CONTIG, + MLX4_ALLOC_TYPE_ALL +}; + enum { MLX4_OPCODE_NOP = 0x00, MLX4_OPCODE_SEND_INVAL = 0x01, @@ -146,6 +313,12 @@ MLX4_OPCODE_LOCAL_INVAL = 0x1b, MLX4_OPCODE_CONFIG_CMD = 0x1f, + MLX4_OPCODE_SEND_ENABLE = 0x17, + MLX4_OPCODE_RECV_ENABLE = 0x16, + MLX4_OPCODE_CQE_WAIT = 0x0f, + MLX4_OPCODE_CALC_SEND = 0x1e, + MLX4_OPCODE_CALC_RDMA_WRITE_IMM = 0x1f, + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, MLX4_RECV_OPCODE_SEND = 0x01, MLX4_RECV_OPCODE_SEND_IMM = 0x02, @@ -155,28 +328,86 @@ MLX4_CQE_OPCODE_RESIZE = 0x16, }; +extern int mlx4_stall_num_loop; +extern int mlx4_trace; +extern int mlx4_single_threaded; +extern int mlx4_use_mutex; + enum { MLX4_MAX_WQE_SIZE = 1008 }; struct mlx4_device { - struct ibv_device ibv_dev; + struct verbs_device verbs_dev; int page_size; - int driver_abi_ver; + + struct { + unsigned id; + unsigned short rev; + } devid; + int driver_abi_ver; }; struct mlx4_db_page; +struct mlx4_lock { + pthread_mutex_t mutex; + pthread_spinlock_t slock; + enum mlx4_lock_state state; + enum mlx4_lock_type type; +}; + +struct mlx4_spinlock { + pthread_spinlock_t lock; + enum mlx4_lock_state state; +}; + +/* struct for BF dedicated for one QP */ +struct mlx4_dedic_bf { + void *address; +}; + +/* struct for the common BF which may be shared by many QPs */ +struct mlx4_cmn_bf { + void *address; + /* + * Protect usage of BF address field including data written to the BF + * and the BF buffer toggling. 
+ */ + struct mlx4_lock lock; +}; + +union mlx4_bf { + struct mlx4_dedic_bf dedic; + struct mlx4_cmn_bf cmn; +}; + +struct mlx4_bfs_data { + struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1]; + struct mlx4_cmn_bf cmn_bf; + uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1]; + uint8_t dedic_bf_free; + struct mlx4_spinlock dedic_bf_lock; /* protect dedicated BFs managing */ + /* including dedic_bf_used and */ + /* dedic_bf_free fields */ + void *page; + uint16_t buf_size; + uint8_t num_dedic_bfs; +}; + struct mlx4_context { - struct ibv_context ibv_ctx; + union { + struct ibv_context ibv_ctx; + }; + struct mlx4_spinlock send_db_lock; /* protects send_db_list and send_db_num_uars */ + struct list_head send_db_list; + unsigned int send_db_num_uars; void *uar; - pthread_spinlock_t uar_lock; - - void *bf_page; - int bf_buf_size; - int bf_offset; - pthread_spinlock_t bf_lock; + struct mlx4_spinlock uar_lock; + struct mlx4_bfs_data bfs; + int bf_regs_per_page; + int max_ctx_res_domain; struct { struct mlx4_qp **table; @@ -189,24 +420,39 @@ int max_qp_wr; int max_sge; int max_cqe; - int cqe_size; - + uint64_t exp_device_cap_flags; struct { - struct mlx4_srq **table; - int refcnt; - } xrc_srq_table[MLX4_XRC_SRQ_TABLE_SIZE]; - pthread_mutex_t xrc_srq_table_mutex; - int num_xrc_srqs; - int xrc_srq_table_shift; - int xrc_srq_table_mask; + int offset; + int mult; + int shift; + uint64_t mask; + } core_clk; + void *hca_core_clock; + + struct mlx4_xsrq_table xsrq_table; struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; pthread_mutex_t db_list_mutex; + int cqe_size; + int prefer_bf; + struct mlx4_spinlock hugetlb_lock; + struct list_head hugetlb_list; + int stall_enable; + pthread_mutex_t task_mutex; + struct { + uint8_t valid; + uint8_t link_layer; + enum ibv_port_cap_flags caps; + } port_query_cache[MLX4_PORTS_NUM]; + pthread_mutex_t env_mtx; + int env_initialized; }; struct mlx4_buf { void *buf; + void *hmem; size_t length; + int base; }; struct mlx4_pd { @@ -214,23 +460,40 @@ uint32_t pdn; }; +enum mlx4_cq_model_flags { + /* + * When set the CQ API must be thread safe. + * When reset application is taking care + * to sync between CQ API calls. 
+ */ + MLX4_CQ_MODEL_FLAG_THREAD_SAFE = 1 << 0, +}; + struct mlx4_cq { - struct ibv_cq ibv_cq; + struct ibv_cq ibv_cq __MLX4_ALGN_DATA__; + uint32_t pattern; struct mlx4_buf buf; struct mlx4_buf resize_buf; - pthread_spinlock_t lock; + struct mlx4_lock lock; uint32_t cqn; uint32_t cons_index; + uint32_t wait_index; + uint32_t wait_count; uint32_t *set_ci_db; uint32_t *arm_db; int arm_sn; - int cqe_size; + int stall_next_poll; + int stall_enable; + int cqe_size; + int creation_flags; + struct mlx4_qp *last_qp; + uint32_t model_flags; /* use mlx4_cq_model_flags */ }; struct mlx4_srq { - struct ibv_srq ibv_srq; + struct verbs_srq verbs_srq; struct mlx4_buf buf; - pthread_spinlock_t lock; + struct mlx4_spinlock lock; uint64_t *wrid; uint32_t srqn; int max; @@ -240,33 +503,102 @@ int tail; uint32_t *db; uint16_t counter; + uint8_t ext_srq; + struct ibv_srq_legacy *ibv_srq_legacy; }; struct mlx4_wq { uint64_t *wrid; - pthread_spinlock_t lock; + struct mlx4_lock lock; int wqe_cnt; int max_post; + char *buf; unsigned head; unsigned tail; int max_gs; int wqe_shift; - int offset; + + /* SEND/RECV_ENABLE data */ + unsigned head_en_index; + unsigned head_en_count; +}; + +/* enclosing ibv_mr adding some extra managing information */ +struct mlx4_mr { + struct ibv_mr ibv_mr; + struct mlx4_buf buf; + uint64_t allocation_flags; + int shared_mr; +}; + + +struct mlx4_inlr_rbuff { + void *rbuff; + int rlen; +}; + +struct mlx4_inlr_sg_list { + struct mlx4_inlr_rbuff *sg_list; + int list_len; +}; + +struct mlx4_inlr_buff { + struct mlx4_inlr_sg_list *buff; + int len; +}; + +struct mlx4_send_db_data { + union mlx4_bf bf; + uint32_t *db_addr; /* Points to the BF related send DB */ + struct list_head list; +}; + +enum mlx4_qp_model_flags { + /* + * When set the QP API must be thread safe. + * When reset application is taking care + * to sync between QP API calls. 
+ */ + MLX4_QP_MODEL_FLAG_THREAD_SAFE = 1 << 0, }; struct mlx4_qp { - struct ibv_qp ibv_qp; - struct mlx4_buf buf; - int max_inline_data; + struct verbs_qp verbs_qp; + uint32_t pattern; int buf_size; - + uint32_t model_flags; /* use mlx4_qp_model_flags */ + + /* hot post send data */ + struct mlx4_wq sq __MLX4_ALGN_DATA__; + int (*post_send_one)(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe, int *total_size, + int *inl, unsigned int ind); + union mlx4_bf *bf; + uint32_t *sdb; /* send DB */ + struct mlx4_buf buf; + unsigned last_db_head; uint32_t doorbell_qpn; - uint32_t sq_signal_bits; - int sq_spare_wqes; - struct mlx4_wq sq; - + uint32_t create_flags; + uint16_t max_inline_data; + uint16_t bf_buf_size; + uint16_t sq_spare_wqes; + uint8_t srcrb_flags_tbl[16]; + uint8_t db_method; + uint8_t qp_type; + /* RAW_PACKET hot data */ + uint8_t link_layer; + /* EXT_MASKED_ATOMIC hot data */ + uint8_t is_masked_atomic; + + /* post receive hot data */ + struct mlx4_wq rq __MLX4_ALGN_DATA__; uint32_t *db; - struct mlx4_wq rq; + uint32_t max_inlr_sg; + int32_t cached_rx_csum_flags; + int32_t transposed_rx_csum_flags; + struct mlx4_inlr_buff inlr_buff; + uint8_t qp_cap_cache; }; struct mlx4_av { @@ -280,7 +612,6 @@ uint8_t hop_limit; uint32_t sl_tclass_flowlabel; uint8_t dgid[16]; - uint8_t mac[8]; }; struct mlx4_ah { @@ -288,18 +619,20 @@ struct mlx4_av av; uint16_t vlan; uint8_t mac[6]; - uint8_t tagged; }; -struct mlx4_xrc_domain { - struct ibv_xrc_domain ibv_xrcd; - uint32_t xrcdn; +struct mlx4_res_domain { + struct ibv_exp_res_domain ibv_res_domain; + struct ibv_exp_res_domain_init_attr attr; + enum mlx4_res_domain_bf_type type; + struct mlx4_send_db_data *send_db; }; static inline unsigned long align(unsigned long val, unsigned long align) { return (val + align - 1) & ~(align - 1); } +int align_queue_size(int req); #define to_mxxx(xxx, type) \ ((struct mlx4_##type *) \ @@ -307,7 +640,10 @@ static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) { - return to_mxxx(dev, device); + /* ibv_device is first field of verbs_device + * see try_driver in libibverbs. 
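/*
 * Editorial aside, not part of the patch: to_mdev() and the other to_m*()
 * helpers rely on first-member embedding - a pointer to the first member of
 * a struct has the same address as the struct itself, so container_of() can
 * walk back out to the wrapper.  A stand-alone sketch with hypothetical
 * types (base/middle/outer stand in for ibv_device/verbs_device/mlx4_device):
 */
#include <assert.h>
#include <stddef.h>

struct base   { int id; };
struct middle { struct base base; int extra; };	/* base is the first member */
struct outer  { struct middle mid; int more; };

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void embed_demo(void)
{
	struct outer o;
	struct base *b = &o.mid.base;	/* what a generic API hands back */

	/* same address because 'base' sits at offset 0 of 'middle' ... */
	assert((void *)b == (void *)&o.mid);
	/* ... so container_of() over the 'middle' member recovers the outer */
	assert(demo_container_of(b, struct outer, mid) == &o);
}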
+ */ + return container_of(ibdev, struct mlx4_device, verbs_dev); } static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) @@ -327,32 +663,53 @@ static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) { - return to_mxxx(srq, srq); + return container_of(container_of(ibsrq, struct verbs_srq, srq), + struct mlx4_srq, verbs_srq); } static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) { - return to_mxxx(qp, qp); + return container_of(container_of(ibqp, struct verbs_qp, qp), + struct mlx4_qp, verbs_qp); } +static inline struct mlx4_mr *to_mmr(struct ibv_mr *ibmr) +{ + return to_mxxx(mr, mr); +} static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) { return to_mxxx(ah, ah); } -#ifdef HAVE_IBV_XRC_OPS -static inline struct mlx4_xrc_domain *to_mxrcd(struct ibv_xrc_domain *ibxrcd) +static inline struct mlx4_res_domain *to_mres_domain(struct ibv_exp_res_domain *ibres_domain) { - return to_mxxx(xrcd, xrc_domain); + return to_mxxx(res_domain, res_domain); } -#endif +int update_port_data(struct ibv_qp *qp, uint8_t port_num); int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); void mlx4_free_buf(struct mlx4_buf *buf); +int mlx4_alloc_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size); +int mlx4_alloc_buf_contig(struct mlx4_context *mctx, struct mlx4_buf *buf, + size_t size, int page_size, const char *component, void *req_addr); +int mlx4_alloc_prefered_buf(struct mlx4_context *mctx, + struct mlx4_buf *buf, + size_t size, int page_size, + enum mlx4_alloc_type alloc_type, + const char *component); +void mlx4_get_alloc_type(struct ibv_context *context, const char *component, + enum mlx4_alloc_type *alloc_type, + enum mlx4_alloc_type default_alloc_type); +void mlx4_free_buf_huge(struct mlx4_context *mctx, struct mlx4_buf *buf); +int mlx4_use_huge(struct ibv_context *context, const char *key); uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); +int __mlx4_query_device(uint64_t raw_fw_ver, + struct ibv_device_attr *attr); int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr); int mlx4_query_port(struct ibv_context *context, uint8_t port, @@ -360,19 +717,42 @@ struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); int mlx4_free_pd(struct ibv_pd *pd); +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr); +int mlx4_close_xrcd(struct ibv_xrcd *xrcd); struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, - size_t length, enum ibv_access_flags access); + size_t length, int access); +struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in); +int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr); +void mlx4_update_post_send_one(struct mlx4_qp *qp); +struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); +struct ibv_exp_cq_family *mlx4_get_poll_cq_family(struct mlx4_cq *cq, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); + +struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in); int mlx4_dereg_mr(struct ibv_mr *mr); +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); +int mlx4_dealloc_mw(struct ibv_mw *mw); +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); +int 
mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind); + struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector); -int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, +int mlx4_alloc_cq_buf(struct mlx4_context *mctx, struct mlx4_buf *buf, int nent, int entry_size); int mlx4_resize_cq(struct ibv_cq *cq, int cqe); int mlx4_destroy_cq(struct ibv_cq *cq); -int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_poll_ibv_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_exp_poll_cq(struct ibv_cq *ibcq, int num_entries, + struct ibv_exp_wc *wc, uint32_t wc_size) __MLX4_ALGN_FUNC__; int mlx4_arm_cq(struct ibv_cq *cq, int solicited); void mlx4_cq_event(struct ibv_cq *cq); void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); @@ -382,76 +762,207 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *attr); +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, - enum ibv_srq_attr_mask mask); + int mask); int mlx4_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_destroy_xrc_srq(struct ibv_srq *srq); int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, struct mlx4_srq *srq); +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); -struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn); -int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn, - struct mlx4_srq *srq); -void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn); struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr); +int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, int attr_mask); +int mlx4_post_task(struct ibv_context *context, + struct ibv_exp_task *task_list, + struct ibv_exp_task **bad_task); +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask, + int attr_mask, struct ibv_qp_init_attr *init_attr); int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask); + int attr_mask); +int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t attr_mask); int mlx4_destroy_qp(struct ibv_qp *qp); +void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n); void mlx4_init_qp_indices(struct mlx4_qp *qp); void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); + struct ibv_send_wr **bad_wr) __MLX4_ALGN_FUNC__; int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, - struct ibv_recv_wr 
**bad_wr); + struct ibv_recv_wr **bad_wr) __MLX4_ALGN_FUNC__; void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, struct mlx4_qp *qp); int num_inline_segs(int data, enum ibv_qp_type type); -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, struct mlx4_qp *qp); +void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp); void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type); struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd, + struct ibv_ah_attr *attr, + uint8_t link_layer); struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd, + struct ibv_exp_ah_attr *attr_ex); int mlx4_destroy_ah(struct ibv_ah *ah); int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, struct mlx4_ah *ah); void mlx4_free_av(struct mlx4_ah *ah); -#ifdef HAVE_IBV_XRC_OPS -struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *attr); -struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context, - int fd, int oflag); - -int mlx4_close_xrc_domain(struct ibv_xrc_domain *d); -int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_qp_num); -int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask); -int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr); -int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); -int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num); -#endif +struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr); +int mlx4_query_values(struct ibv_context *context, int q_values, + struct ibv_exp_values *values); +void *mlx4_get_legacy_xrc(struct ibv_srq *srq); +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq); +void read_init_vars(struct mlx4_context *ctx); + +static inline enum mlx4_lock_type mlx4_get_locktype(void) +{ + if (!mlx4_use_mutex) + return MLX4_SPIN_LOCK; + + return MLX4_MUTEX; +} + +static inline int mlx4_spin_lock(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_lock(&lock->lock); + + if (unlikely(lock->state == MLX4_LOCKED)) { + fprintf(stderr, "*** ERROR: multithreading violation ***\n" + "You are running a multithreaded application but\n" + "you set MLX4_SINGLE_THREADED=1. 
Please unset it.\n"); + abort(); + } else { + lock->state = MLX4_LOCKED; + wmb(); + } + + return 0; +} + +static inline int mlx4_spin_unlock(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_unlock(&lock->lock); + + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_lock(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_lock(&lock->slock); + + return pthread_mutex_lock(&lock->mutex); + } + + if (unlikely(lock->state == MLX4_LOCKED)) { + fprintf(stderr, "*** ERROR: multithreading violation ***\n" + "You are running a multithreaded application but\n" + "you set MLX4_SINGLE_THREADED=1. Please unset it.\n"); + abort(); + } else { + lock->state = MLX4_LOCKED; + /* Make new state visable to other threads. */ + wmb(); + } + + return 0; +} + +static inline int mlx4_unlock(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_unlock(&lock->slock); + + return pthread_mutex_unlock(&lock->mutex); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} +static inline int mlx4_spinlock_init(struct mlx4_spinlock *lock, int use_spinlock) +{ + if (use_spinlock) { + lock->state = MLX4_USE_LOCK; + return pthread_spin_init(&lock->lock, PTHREAD_PROCESS_PRIVATE); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_spinlock_destroy(struct mlx4_spinlock *lock) +{ + if (lock->state == MLX4_USE_LOCK) + return pthread_spin_destroy(&lock->lock); + + return 0; +} + +static inline int mlx4_lock_init(struct mlx4_lock *lock, + int use_lock, + enum mlx4_lock_type lock_type) +{ + if (use_lock) { + lock->type = lock_type; + lock->state = MLX4_USE_LOCK; + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_init(&lock->slock, + PTHREAD_PROCESS_PRIVATE); + + return pthread_mutex_init(&lock->mutex, + PTHREAD_PROCESS_PRIVATE); + } + lock->state = MLX4_UNLOCKED; + + return 0; +} + +static inline int mlx4_lock_destroy(struct mlx4_lock *lock) +{ + if (lock->state == MLX4_USE_LOCK) { + if (lock->type == MLX4_SPIN_LOCK) + return pthread_spin_destroy(&lock->slock); + + return pthread_mutex_destroy(&lock->mutex); + } + + return 0; +} + +static inline void mlx4_update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); +} #endif /* MLX4_H */ Index: contrib/ofed/libmlx4/src/mlx4.c =================================================================== --- contrib/ofed/libmlx4/src/mlx4.c +++ contrib/ofed/libmlx4/src/mlx4.c @@ -41,18 +41,27 @@ #include #include #include - +#include +#include +#include #ifndef HAVE_IBV_REGISTER_DRIVER #include #endif +#include #include "mlx4.h" #include "mlx4-abi.h" +#include "mlx4_exp.h" + #ifndef PCI_VENDOR_ID_MELLANOX #define PCI_VENDOR_ID_MELLANOX 0x15b3 #endif +int mlx4_trace = 0; +int mlx4_single_threaded = 0; +int mlx4_use_mutex = 0; + #define HCA(v, d) \ { .vendor = PCI_VENDOR_ID_##v, \ .device = d } @@ -66,47 +75,30 @@ HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ - HCA(MELLANOX, 0x6368), /* MT25448 [ConnectX EN 10GigE, PCIe 2.0 2.5GT/s] */ - HCA(MELLANOX, 0x6750), /* MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] */ - HCA(MELLANOX, 0x6372), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 2.0 2.5GT/s] */ - HCA(MELLANOX, 0x675a), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe Gen2 5GT/s] */ - HCA(MELLANOX, 0x6764), /* MT26468 
[ConnectX EN 10GigE, PCIe 2.0 5GT/s] */ - HCA(MELLANOX, 0x6746), /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ - HCA(MELLANOX, 0x676e), /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */ - HCA(MELLANOX, 0x6778), /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ - HCA(MELLANOX, 0x1000), - HCA(MELLANOX, 0x1001), - HCA(MELLANOX, 0x1002), - HCA(MELLANOX, 0x1003), - HCA(MELLANOX, 0x1004), - HCA(MELLANOX, 0x1005), - HCA(MELLANOX, 0x1006), - HCA(MELLANOX, 0x1007), - HCA(MELLANOX, 0x1008), - HCA(MELLANOX, 0x1009), - HCA(MELLANOX, 0x100a), - HCA(MELLANOX, 0x100b), - HCA(MELLANOX, 0x100c), - HCA(MELLANOX, 0x100d), - HCA(MELLANOX, 0x100e), - HCA(MELLANOX, 0x100f), + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ }; -#ifdef HAVE_IBV_MORE_OPS -static struct ibv_more_ops mlx4_more_ops = { -#ifdef HAVE_IBV_XRC_OPS - .create_xrc_srq = mlx4_create_xrc_srq, - .open_xrc_domain = mlx4_open_xrc_domain, - .close_xrc_domain = mlx4_close_xrc_domain, - .create_xrc_rcv_qp = mlx4_create_xrc_rcv_qp, - .modify_xrc_rcv_qp = mlx4_modify_xrc_rcv_qp, - .query_xrc_rcv_qp = mlx4_query_xrc_rcv_qp, - .reg_xrc_rcv_qp = mlx4_reg_xrc_rcv_qp, - .unreg_xrc_rcv_qp = mlx4_unreg_xrc_rcv_qp, -#endif -}; -#endif - static struct ibv_context_ops mlx4_ctx_ops = { .query_device = mlx4_query_device, .query_port = mlx4_query_port, @@ -114,8 +106,11 @@ .dealloc_pd = mlx4_free_pd, .reg_mr = mlx4_reg_mr, .dereg_mr = mlx4_dereg_mr, + .alloc_mw = mlx4_alloc_mw, + .dealloc_mw = mlx4_dealloc_mw, + .bind_mw = mlx4_bind_mw, .create_cq = mlx4_create_cq, - .poll_cq = mlx4_poll_cq, + .poll_cq = mlx4_poll_ibv_cq, .req_notify_cq = mlx4_arm_cq, .cq_event = mlx4_cq_event, .resize_cq = mlx4_resize_cq, @@ -137,150 +132,592 @@ .detach_mcast = ibv_cmd_detach_mcast }; -static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, int cmd_fd) +static int read_number_from_line(const char *line, int *value) { - struct mlx4_context *context; - struct ibv_get_context cmd; - struct mlx4_alloc_ucontext_resp resp; - struct mlx4_alloc_ucontext_resp_v3 resp_v3; - int i; - struct ibv_device_attr dev_attrs; - unsigned int bf_reg_size; + const char *ptr; - context = calloc(1, sizeof *context); - if (!context) - return NULL; + ptr = strchr(line, ':'); + if (!ptr) + return 1; + + ++ptr; + + *value = atoi(ptr); + return 0; +} + +static int mlx4_is_sandy_bridge(int 
*num_cores) +{ + char line[128]; + FILE *fd; + int rc = 0; + int cur_cpu_family = -1; + int cur_cpu_model = -1; + + fd = fopen("/proc/cpuinfo", "r"); + if (!fd) + return 0; + + *num_cores = 0; + + while (fgets(line, 128, fd)) { + int value; + + /* if this is information on new processor */ + if (!strncmp(line, "processor", 9)) { + ++*num_cores; + + cur_cpu_family = -1; + cur_cpu_model = -1; + } else if (!strncmp(line, "cpu family", 10)) { + if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value))) + cur_cpu_family = value; + } else if (!strncmp(line, "model", 5)) { + if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value))) + cur_cpu_model = value; + } + + /* if this is a Sandy Bridge CPU */ + if ((cur_cpu_family == 6) && + (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D)) + rc = 1; + } + + fclose(fd); + return rc; +} + +static void mlx4_check_numa_enabled(struct ibv_context *context) +{ + char fname[MAXPATHLEN]; + char buf[128]; + FILE *fp; + int numa_enabled; + char env[VERBS_MAX_ENV_VAL]; + + snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/numa_node", + ibv_get_device_name(context->device)); + + fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, PFX "Warning: can not check if NUMA is enabled " + "on node: failed to open %s\n", fname); + return; + } + + if (!fgets(buf, sizeof(buf), fp)) { + fprintf(stderr, PFX "Warning: can not check if NUMA is enabled " + "on node: failed to read numa node value\n"); + goto out; + } + + numa_enabled = (strtol(buf, 0, 10) >= 0); + if (numa_enabled) + printf(PFX "Device NUMA node detection is supported\n"); + else if (ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env, sizeof(env))) + printf(PFX "Warning: Device NUMA node detection is not supported. " + "Please consider setting the environment variable " + "'MLX4_LOCAL_CPUS' or enable ACPI SLIT\n"); +out: + fclose(fp); +} + +static void dump_cpu_set(cpuset_t *cpu_set) +{ + int i; + int first_cpu = -1; + int last_cpu = -1; + int n = 0; + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpu_set)) { + if (first_cpu < 0) + first_cpu = i; + if (i == CPU_SETSIZE - 1) + last_cpu = i; + } else if (first_cpu >= 0) + last_cpu = i - 1; + + if (last_cpu >= 0) { + if (first_cpu != last_cpu) + printf("%s%d-%d", n ? "," : "", first_cpu, + last_cpu); + else + printf("%s%d", n ? "," : "", last_cpu); + + first_cpu = -1; + last_cpu = -1; + ++n; + } + } +} + +/* +man cpuset + + This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words + are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between + words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits + within a word are also in big-endian order. + + The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on + the size of the bitmask. + + Examples of the Mask Format: + + 00000001 # just bit 0 set + 40000000,00000000,00000000 # just bit 94 set + 000000ff,00000000 # bits 32-39 set + 00000000,000E3862 # 1,5,6,11-13,17-19 set + + A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as: + + 00000001,00000001,00010117 + + The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for + bit 4, and the "7" is for bits 2, 1, and 0. 
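/*
 * Editorial aside, not part of the patch: a minimal stand-alone parser for
 * the mask format quoted above (comma-separated 32-bit hex words, most
 * significant word first), walking the string from the last word backwards
 * the same way mlx4_local_cpu_set() below does.  parse_cpu_mask() is a
 * hypothetical helper, limited to 64 CPUs (two words) for brevity:
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static uint64_t parse_cpu_mask(const char *mask)
{
	char buf[128];
	const char *tok;
	char *p;
	uint64_t out = 0;
	int word = 0;		/* 0 = least significant (rightmost) word */

	strncpy(buf, mask, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (;;) {
		p = strrchr(buf, ',');
		tok = p ? p + 1 : buf;

		if (word < 2)	/* a uint64_t holds two 32-bit words */
			out |= (uint64_t)strtoul(tok, NULL, 16) << (32 * word);
		++word;

		if (!p)
			break;
		*p = '\0';	/* drop the word just consumed */
	}

	return out;
}

static void mask_demo(void)
{
	assert(parse_cpu_mask("00000001") == 0x1ULL);			/* bit 0      */
	assert(parse_cpu_mask("000000ff,00000000") == 0xff00000000ULL);	/* bits 32-39 */
}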
+*/ +static void mlx4_local_cpu_set(struct ibv_context *context, cpuset_t *cpu_set) +{ + char *p, buf[1024]; + char env_value[VERBS_MAX_ENV_VAL]; + uint32_t word; + int i, k; + + if (mlx4_trace) + mlx4_check_numa_enabled(context); + + if (!ibv_exp_cmd_getenv(context, "MLX4_LOCAL_CPUS", env_value, sizeof(env_value))) { + strncpy(buf, env_value, sizeof(buf)); + if (mlx4_trace) + printf(PFX "Local CPUs flags were override by %s\n", buf); + } else { + char fname[MAXPATHLEN]; + FILE *fp; + + snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus", + ibv_get_device_name(context->device)); + + fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname); + return; + } + if (!fgets(buf, sizeof(buf), fp)) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to read cpu mask\n"); + fclose(fp); + return; + } + fclose(fp); + } - context->ibv_ctx.cmd_fd = cmd_fd; + p = strrchr(buf, ','); + if (!p) + p = buf; - if (to_mdev(ibdev)->driver_abi_ver > 3) { - if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp)) - goto err_free; + i = 0; + do { + if (*p == ',') { + *p = 0; + p ++; + } + + word = strtoul(p, 0, 16); + + for (k = 0; word; ++k, word >>= 1) + if (word & 1) + CPU_SET(k+i, cpu_set); + + if (p == buf) + break; + + p = strrchr(buf, ','); + if (!p) + p = buf; + + i += 32; + } while (i < CPU_SETSIZE); +} + +static int mlx4_enable_sandy_bridge_fix(struct ibv_context *context) +{ + cpuset_t my_cpus, dev_local_cpus, result_set; + int stall_enable; + int ret; + int num_cores; + + if (!mlx4_is_sandy_bridge(&num_cores)) + return 0; + + /* by default disable stall on sandy bridge arch */ + stall_enable = 0; + + /* + * check if app is bound to cpu set that is inside + * of device local cpu set. Disable stalling if true + */ + + /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */ + CPU_ZERO(&my_cpus); + CPU_ZERO(&dev_local_cpus); + CPU_ZERO(&result_set); + ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, + sizeof(my_cpus), &my_cpus); + if (ret == -1) { + if (errno == EINVAL) + fprintf(stderr, PFX "Warning: my cpu set is too small\n"); + else + fprintf(stderr, PFX "Warning: failed to get my cpu set\n"); + goto out; + } + + if (mlx4_trace) { + printf(PFX "Running on cpus: "); + dump_cpu_set(&my_cpus); + printf("\n"); + } + + /* get device local cpu set */ + mlx4_local_cpu_set(context, &dev_local_cpus); + + /* make sure result_set is not init to all 0 */ + CPU_SET(0, &result_set); + /* Set stall_enable if my cpu set and dev cpu set are disjoint sets */ + CPU_AND(&result_set, &my_cpus); + CPU_AND(&result_set, &dev_local_cpus); + stall_enable = CPU_COUNT(&result_set) ? 0 : 1; + + if (mlx4_trace) { + printf(PFX "HCA:%s local cpus: ", ibv_get_device_name(context->device)); + dump_cpu_set(&dev_local_cpus); + printf("\n"); + if (CPU_COUNT(&my_cpus) == num_cores) { + printf(PFX "Warning: CPU affinity wasn't used for this " + "process, if the system has more than one numa node, it might be using a remote one.\n"); + printf(PFX " For achieving better performance, " + "please consider setting the CPU " + "affinity.\n"); + } + } + +out: + if (mlx4_trace) + printf(PFX "Sandy Bridge CPU was detected, cq_stall is %s\n", + stall_enable ? 
"enabled" : "disabled"); + + return stall_enable; +} + +static void mlx4_read_env(struct ibv_device *ibdev, struct mlx4_context *ctx) +{ + char env_value[VERBS_MAX_ENV_VAL]; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_TRACE", env_value, sizeof(env_value)) && + (strcmp(env_value, "0"))) + mlx4_trace = 1; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_CQ_POLL", env_value, sizeof(env_value)) && + !strcmp(env_value, "0")) + /* check if cq stall is overrided by user */ + ctx->stall_enable = 0; + else + /* autodetect if we need to do cq polling */ + ctx->stall_enable = mlx4_enable_sandy_bridge_fix(&ctx->ibv_ctx); + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_STALL_NUM_LOOP", env_value, sizeof(env_value))) + mlx4_stall_num_loop = atoi(env_value); + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_SINGLE_THREADED", env_value, sizeof(env_value))) + mlx4_single_threaded = strcmp(env_value, "1") ? 0 : 1; + + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, + "MLX4_USE_MUTEX", + env_value, + sizeof(env_value))) + mlx4_use_mutex = strcmp(env_value, "1") ? 0 : 1; +} + +void read_init_vars(struct mlx4_context *ctx) +{ + char env_value[VERBS_MAX_ENV_VAL]; + + pthread_mutex_lock(&ctx->env_mtx); + if (!ctx->env_initialized) { + mlx4_read_env(ctx->ibv_ctx.device, ctx); + if (!ibv_exp_cmd_getenv(&ctx->ibv_ctx, "MLX4_POST_SEND_PREFER_BF", env_value, sizeof(env_value))) { + ctx->prefer_bf = !!strcmp(env_value, "0"); + if (mlx4_trace) + printf(PFX "prefer_bf=%d\n", ctx->prefer_bf); + } else { + ctx->prefer_bf = 1; + } - context->num_qps = resp.qp_tab_size; - context->num_xrc_srqs = resp.qp_tab_size; - bf_reg_size = resp.bf_reg_size; - context->cqe_size = resp.cqe_size; + ctx->env_initialized = 1; + } + pthread_mutex_unlock(&ctx->env_mtx); +} + +static int mlx4_init_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx, int cmd_fd) +{ + struct mlx4_context *context; + struct mlx4_alloc_ucontext_req req; + struct mlx4_alloc_ucontext_resp resp; + struct mlx4_alloc_ucontext_resp_v3 resp_v3; + int i; + struct ibv_exp_device_attr dev_attrs; + struct ibv_device_attr dev_legacy_attrs; + struct mlx4_device *dev = to_mdev(&v_device->device); + unsigned int qp_tab_size; + unsigned int bf_reg_size; + unsigned int cqe_size; + int hca_clock_offset; + void *hca_clock_page = NULL; + + /* verbs_context should be used for new verbs. + * memory footprint of mlx4_context and verbs_context share + * struct ibv_context. 
+ */ + struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx); + struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ibv_ctx); + + memset(&req, 0, sizeof(req)); + context = to_mctx(ibv_ctx); + ibv_ctx->cmd_fd = cmd_fd; + ibv_ctx->device = &v_device->device; + + if (pthread_mutex_init(&context->env_mtx, NULL)) + return EIO; + + if (dev->driver_abi_ver > 3) { +#ifdef MLX4_WQE_FORMAT + req.lib_caps = MLX4_USER_DEV_CAP_WQE_FORMAT; +#endif + if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp))) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); + qp_tab_size = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + context->bf_regs_per_page = resp.bf_regs_per_page; + cqe_size = resp.cqe_size; } else { - if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, - &resp_v3.ibv_resp, sizeof resp_v3)) - goto err_free; - - context->num_qps = resp_v3.qp_tab_size; - context->num_xrc_srqs = resp_v3.qp_tab_size; - bf_reg_size = resp_v3.bf_reg_size; - context->cqe_size = 32; + if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req.cmd), + &resp_v3.ibv_resp, sizeof(resp_v3))) + return errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp_v3, sizeof(resp_v3)); + qp_tab_size = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->bf_regs_per_page = resp_v3.bf_regs_per_page; + cqe_size = 32; } + context->num_qps = qp_tab_size; context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; context->qp_table_mask = (1 << context->qp_table_shift) - 1; + context->cqe_size = cqe_size; + for (i = 0; i < MLX4_PORTS_NUM; ++i) + context->port_query_cache[i].valid = 0; pthread_mutex_init(&context->qp_table_mutex, NULL); for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) context->qp_table[i].refcnt = 0; - context->xrc_srq_table_shift = ffs(context->num_xrc_srqs) - 1 - - MLX4_XRC_SRQ_TABLE_BITS; - context->xrc_srq_table_mask = (1 << context->xrc_srq_table_shift) - 1; - - pthread_mutex_init(&context->xrc_srq_table_mutex, NULL); - for (i = 0; i < MLX4_XRC_SRQ_TABLE_SIZE; ++i) - context->xrc_srq_table[i].refcnt = 0; - for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) context->db_list[i] = NULL; + mlx4_init_xsrq_table(&context->xsrq_table, qp_tab_size); pthread_mutex_init(&context->db_list_mutex, NULL); - context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE, + context->uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, 0); if (context->uar == MAP_FAILED) - goto err_free; + return errno; if (bf_reg_size) { - context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size, - PROT_WRITE, MAP_SHARED, cmd_fd, - to_mdev(ibdev)->page_size); - if (context->bf_page == MAP_FAILED) { + context->bfs.page = mmap(NULL, dev->page_size, + PROT_WRITE, MAP_SHARED, cmd_fd, + dev->page_size); + if (context->bfs.page == MAP_FAILED) { fprintf(stderr, PFX "Warning: BlueFlame available, " "but failed to mmap() BlueFlame page.\n"); - context->bf_page = NULL; - context->bf_buf_size = 0; + context->bfs.page = NULL; + context->bfs.buf_size = 0; + context->bfs.num_dedic_bfs = 0; } else { - context->bf_buf_size = bf_reg_size / 2; - context->bf_offset = 0; - pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + context->bfs.num_dedic_bfs = min(context->bf_regs_per_page - 1, + MLX4_MAX_BFS_IN_PAGE - 1); + context->bfs.buf_size = bf_reg_size / 2; + mlx4_spinlock_init(&context->bfs.dedic_bf_lock, !mlx4_single_threaded); + context->bfs.cmn_bf.address = context->bfs.page; + + mlx4_lock_init(&context->bfs.cmn_bf.lock, + !mlx4_single_threaded, + mlx4_get_locktype()); + + 
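+ /*
+ * Carve the remaining BlueFlame registers on this page into
+ * dedicated buffers handed out to QPs; register 0 (bfs.page)
+ * stays as the common BF protected by cmn_bf.lock above.
+ */
+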
context->bfs.dedic_bf_free = context->bfs.num_dedic_bfs; + for (i = 0; i < context->bfs.num_dedic_bfs; i++) { + context->bfs.dedic_bf[i].address = context->bfs.page + (i + 1) * MLX4_BFS_STRIDE; + context->bfs.dedic_bf_used[i] = 0; + } } } else { - context->bf_page = NULL; - context->bf_buf_size = 0; + context->bfs.page = NULL; + context->bfs.buf_size = 0; + context->bfs.num_dedic_bfs = 0; } - pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + mlx4_spinlock_init(&context->uar_lock, !mlx4_single_threaded); - context->ibv_ctx.ops = mlx4_ctx_ops; -#ifdef HAVE_IBV_XRC_OPS - context->ibv_ctx.more_ops = &mlx4_more_ops; -#endif + mlx4_spinlock_init(&context->send_db_lock, !mlx4_single_threaded); + INIT_LIST_HEAD(&context->send_db_list); + + mlx4_spinlock_init(&context->hugetlb_lock, !mlx4_single_threaded); + INIT_LIST_HEAD(&context->hugetlb_list); - if (mlx4_query_device(&context->ibv_ctx, &dev_attrs)) - goto query_free; + pthread_mutex_init(&context->task_mutex, NULL); + + memset(&dev_attrs, 0, sizeof(dev_attrs)); + dev_attrs.comp_mask = IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK | + IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK | + IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | + IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + + if (mlx4_exp_query_device(ibv_ctx, &dev_attrs)) { + if (mlx4_query_device(ibv_ctx, &dev_legacy_attrs)) + goto query_free; + + memcpy(&dev_attrs, &dev_legacy_attrs, sizeof(dev_legacy_attrs)); + } context->max_qp_wr = dev_attrs.max_qp_wr; context->max_sge = dev_attrs.max_sge; context->max_cqe = dev_attrs.max_cqe; - if (!(dev_attrs.device_cap_flags & IBV_DEVICE_XRC)) { - fprintf(stderr, PFX "There is a mismatch between " - "the kernel and the userspace libraries: " - "Kernel does not support XRC. Exiting.\n"); - goto query_free; + context->exp_device_cap_flags = dev_attrs.exp_device_cap_flags; + if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN) + context->max_ctx_res_domain = dev_attrs.max_ctx_res_domain; + + VALGRIND_MAKE_MEM_DEFINED(&context->hca_core_clock, sizeof(context->hca_core_clock)); + if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { + if (dev_attrs.hca_core_clock) + context->core_clk.mult = ((1ull * 1000) << 29) / + dev_attrs.hca_core_clock; + else + context->core_clk.mult = 0; + + context->core_clk.shift = 29; + context->core_clk.mask = dev_attrs.timestamp_mask; + + if (ioctl(cmd_fd, MLX4_IOCHWCLOCKOFFSET, + &hca_clock_offset) >= 0) { + VALGRIND_MAKE_MEM_DEFINED(&hca_clock_offset, sizeof(hca_clock_offset)); + context->core_clk.offset = hca_clock_offset; + hca_clock_page = mmap(NULL, hca_clock_offset + + sizeof(context->core_clk.mask), + PROT_READ, MAP_SHARED, cmd_fd, + dev->page_size * + (MLX4_IB_MMAP_GET_HW_CLOCK)); + + if (hca_clock_page == MAP_FAILED) { + fprintf(stderr, PFX + "Warning: Timestamp available,\n" + "but failed to mmap() hca core " + "clock page.\n"); + } else { + context->hca_core_clock = hca_clock_page + + context->core_clk.offset; + } + } } - return &context->ibv_ctx; + ibv_ctx->ops = mlx4_ctx_ops; + + verbs_ctx->has_comp_mask |= VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ | + VERBS_CONTEXT_QP; + + verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd); + verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd); + verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex); + verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num); + verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex); + verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp); + verbs_set_ctx_op(verbs_ctx, create_flow, ibv_cmd_create_flow); + 
verbs_set_ctx_op(verbs_ctx, destroy_flow, ibv_cmd_destroy_flow); + + /* + * Set experimental verbs + */ + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_reg_shared_mr, mlx4_reg_shared_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow, ibv_exp_cmd_create_flow); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow, ibv_exp_cmd_destroy_flow); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah, mlx4_exp_create_ah); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device, mlx4_exp_query_device); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp, mlx4_exp_create_qp); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp, mlx4_exp_modify_qp); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port, mlx4_exp_query_port); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx4_modify_cq); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx4_post_task); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc, mlx4_set_legacy_xrc); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc, mlx4_get_legacy_xrc); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq, mlx4_exp_poll_cq); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx4_create_cq_ex); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values, mlx4_query_values); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx4_exp_reg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send, mlx4_exp_post_send); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_bind_mw, mlx4_exp_bind_mw); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_rereg_mr, mlx4_exp_rereg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx4_exp_dereg_mr); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain, mlx4_exp_create_res_domain); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain, mlx4_exp_destroy_res_domain); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx4_exp_query_intf); + verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf, mlx4_exp_release_intf); + + return 0; query_free: - munmap(context->uar, to_mdev(ibdev)->page_size); - if (context->bf_page) - munmap(context->bf_page, to_mdev(ibdev)->page_size); - -err_free: - free(context); - return NULL; + munmap(context->uar, dev->page_size); + if (context->bfs.page) + munmap(context->bfs.page, dev->page_size); + if (hca_clock_page) + munmap(hca_clock_page, hca_clock_offset + + sizeof(context->core_clk.mask)); + + return errno; } -static void mlx4_free_context(struct ibv_context *ibctx) +static void mlx4_uninit_context(struct verbs_device *v_device, + struct ibv_context *ibv_ctx) { - struct mlx4_context *context = to_mctx(ibctx); - - munmap(context->uar, to_mdev(ibctx->device)->page_size); - if (context->bf_page) - munmap(context->bf_page, to_mdev(ibctx->device)->page_size); - free(context); + struct mlx4_context *context = to_mctx(ibv_ctx); + + munmap(context->uar, to_mdev(&v_device->device)->page_size); + if (context->bfs.page) + munmap(context->bfs.page, + to_mdev(&v_device->device)->page_size); + if (context->hca_core_clock) + munmap((context->hca_core_clock - context->core_clk.offset), + context->core_clk.offset + sizeof(context->core_clk.mask)); } -static struct ibv_device_ops mlx4_dev_ops = { - .alloc_context = mlx4_alloc_context, - .free_context = mlx4_free_context -}; - -static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path, - int abi_version) +static struct verbs_device *mlx4_driver_init(const char *uverbs_sys_path, + int abi_version) { char value[8]; - struct mlx4_device 
*dev; + struct mlx4_device *dev; unsigned vendor, device; int i; if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", value, sizeof value) < 0) return NULL; - sscanf(value, "%i", &vendor); + vendor = strtol(value, NULL, 16); if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", value, sizeof value) < 0) return NULL; - sscanf(value, "%i", &device); + device = strtol(value, NULL, 16); for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) if (vendor == hca_table[i].vendor && @@ -300,24 +737,32 @@ return NULL; } - dev = malloc(sizeof *dev); + dev = calloc(1, sizeof(*dev)); if (!dev) { fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", uverbs_sys_path); return NULL; } - dev->ibv_dev.ops = mlx4_dev_ops; dev->page_size = sysconf(_SC_PAGESIZE); + + dev->devid.id = device; dev->driver_abi_ver = abi_version; - return &dev->ibv_dev; + dev->verbs_dev.sz = sizeof(*dev); + dev->verbs_dev.size_of_context = + sizeof(struct mlx4_context) - sizeof(struct ibv_context); + /* mlx4_init_context will initialize provider calls */ + dev->verbs_dev.init_context = mlx4_init_context; + dev->verbs_dev.uninit_context = mlx4_uninit_context; + + return &dev->verbs_dev; } #ifdef HAVE_IBV_REGISTER_DRIVER static __attribute__((constructor)) void mlx4_register_driver(void) { - ibv_register_driver("mlx4", mlx4_driver_init); + verbs_register_driver("mlx4", mlx4_driver_init); } #else /* Index: contrib/ofed/libmlx4/src/mlx4_exp.h =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/mlx4_exp.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_EXP_H +#define MLX4_EXP_H + +#include +#include "mlx4.h" + +/* + * mlx4-abi experimental structs + */ +struct mlx4_exp_create_qp { + struct ibv_exp_create_qp ibv_cmd; + struct mlx4_exp_create_qp_provider exp_cmd; +}; + +struct mlx4_exp_create_cq { + struct ibv_exp_create_cq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +/* + * Experimental functions + */ +struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr); +int mlx4_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *attr); +int mlx4_exp_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_exp_port_attr *port_attr); +int mlx4_exp_modify_cq(struct ibv_cq *cq, struct ibv_exp_cq_attr *attr, + int attr_mask); +int mlx4_exp_rereg_mr(struct ibv_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr, struct ibv_exp_rereg_out *out); +int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out); +struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain_init_attr *attr); +int mlx4_exp_destroy_res_domain(struct ibv_context *context, + struct ibv_exp_res_domain *res_dom, + struct ibv_exp_destroy_res_domain_attr *attr); +void *mlx4_exp_query_intf(struct ibv_context *context, struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status); +int mlx4_exp_release_intf(struct ibv_context *context, void *intf, + struct ibv_exp_release_intf_params *params); + +#endif /* MLX4_EXP_H */ Index: contrib/ofed/libmlx4/src/qp.c =================================================================== --- contrib/ofed/libmlx4/src/qp.c +++ contrib/ofed/libmlx4/src/qp.c @@ -40,11 +40,40 @@ #include #include #include +#include #include "mlx4.h" #include "doorbell.h" #include "wqe.h" +#ifndef htobe64 +#include +# if __BYTE_ORDER == __LITTLE_ENDIAN +# define htobe64(x) __bswap_64 (x) +# else +# define htobe64(x) (x) +# endif +#endif + +#ifdef MLX4_WQE_FORMAT + #define SET_BYTE_COUNT(byte_count) (htonl(byte_count) | owner_bit) + #define WQE_CTRL_OWN (1 << 30) +#else + #define SET_BYTE_COUNT(byte_count) htonl(byte_count) + #define WQE_CTRL_OWN (1 << 31) +#endif +enum { + MLX4_OPCODE_BASIC = 0x00010000, + MLX4_OPCODE_MANAGED = 0x00020000, + + MLX4_OPCODE_WITH_IMM = 0x01000000 +}; + +#define MLX4_IB_OPCODE(op, class, attr) (((class) & 0x00FF0000) | ((attr) & 0xFF000000) | ((op) & 0x0000FFFF)) +#define MLX4_IB_OPCODE_GET_CLASS(opcode) ((opcode) & 0x00FF0000) +#define MLX4_IB_OPCODE_GET_OP(opcode) ((opcode) & 0x0000FFFF) +#define MLX4_IB_OPCODE_GET_ATTR(opcode) ((opcode) & 0xFF000000) + static const uint32_t mlx4_ib_opcode[] = { [IBV_WR_SEND] = MLX4_OPCODE_SEND, [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, @@ -55,14 +84,151 @@ [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, }; -static void *get_recv_wqe(struct mlx4_qp *qp, int n) + +static const uint32_t mlx4_ib_opcode_exp[] = { + [IBV_EXP_WR_SEND] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_RDMA_WRITE] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_RDMA_WRITE_WITH_IMM] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_WRITE_IMM, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_RDMA_READ] = MLX4_IB_OPCODE(MLX4_OPCODE_RDMA_READ, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_ATOMIC_CMP_AND_SWP] = 
MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_CS, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_FA, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_CS, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD] = MLX4_IB_OPCODE(MLX4_OPCODE_ATOMIC_MASK_FA, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_LOCAL_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_LOCAL_INVAL, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_WITH_INV] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_INVAL, MLX4_OPCODE_BASIC, MLX4_OPCODE_WITH_IMM), + [IBV_EXP_WR_BIND_MW] = MLX4_IB_OPCODE(MLX4_OPCODE_BIND_MW, MLX4_OPCODE_BASIC, 0), + [IBV_EXP_WR_SEND_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_SEND_ENABLE, MLX4_OPCODE_MANAGED, 0), + [IBV_EXP_WR_RECV_ENABLE] = MLX4_IB_OPCODE(MLX4_OPCODE_RECV_ENABLE, MLX4_OPCODE_MANAGED, 0), + [IBV_EXP_WR_CQE_WAIT] = MLX4_IB_OPCODE(MLX4_OPCODE_CQE_WAIT, MLX4_OPCODE_MANAGED, 0), +}; + +enum { + MLX4_CALC_FLOAT64_ADD = 0x00, + MLX4_CALC_UINT64_ADD = 0x01, + MLX4_CALC_UINT64_MAXLOC = 0x02, + MLX4_CALC_UINT64_AND = 0x03, + MLX4_CALC_UINT64_XOR = 0x04, + MLX4_CALC_UINT64_OR = 0x05 +}; + +enum { + MLX4_WQE_CTRL_CALC_OP = 26 +}; + +static const struct mlx4_calc_op { + int valid; + uint32_t opcode; +} mlx4_calc_ops_table + [IBV_EXP_CALC_DATA_SIZE_NUMBER] + [IBV_EXP_CALC_OP_NUMBER] + [IBV_EXP_CALC_DATA_TYPE_NUMBER] = { + [IBV_EXP_CALC_DATA_SIZE_64_BIT] = { + [IBV_EXP_CALC_OP_ADD] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_ADD << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_FLOAT64_ADD << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BXOR] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_XOR << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BAND] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_AND << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_BOR] = { + [IBV_EXP_CALC_DATA_TYPE_INT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP }, + [IBV_EXP_CALC_DATA_TYPE_FLOAT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_OR << MLX4_WQE_CTRL_CALC_OP } + }, + [IBV_EXP_CALC_OP_MAXLOC] = { + [IBV_EXP_CALC_DATA_TYPE_UINT] = { + .valid = 1, + .opcode = MLX4_CALC_UINT64_MAXLOC << MLX4_WQE_CTRL_CALC_OP } + } + } +}; + +static int post_send_other(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_rc_raw_packet(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_ud(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int 
post_send_rc_uc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; +static int post_send_xrc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) __MLX4_ALGN_FUNC__; + +#define MLX4_WAIT_EN_VALID (1<<30) + +static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) __attribute__((always_inline)); +static inline void set_wait_en_seg(void *wqe_seg, uint32_t obj_num, uint32_t count) { - return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); + struct mlx4_wqe_wait_en_seg *seg = (struct mlx4_wqe_wait_en_seg *)wqe_seg; + + seg->valid = htonl(MLX4_WAIT_EN_VALID); + seg->pi = htonl(count); + seg->obj_num = htonl(obj_num); + + return; } -static void *get_send_wqe(struct mlx4_qp *qp, int n) +static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) __attribute__((always_inline)); +static inline void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->rq.buf + (n << qp->rq.wqe_shift); +} + +void *mlx4_get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return get_recv_wqe(qp, n); +} + +static void *get_send_wqe64(struct mlx4_qp *qp, unsigned int n) +{ + return qp->sq.buf + (n << 6); +} +static void *get_send_wqe(struct mlx4_qp *qp, unsigned int n) { - return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); + return qp->sq.buf + (n << qp->sq.wqe_shift); } /* @@ -70,7 +236,48 @@ * first four bytes of every 64 byte chunk with 0xffffffff, except for * the very first chunk of the WQE. */ -static void stamp_send_wqe(struct mlx4_qp *qp, int n) +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head_en_index = 0; + qp->sq.head_en_count = 0; + qp->rq.head_en_index = 0; + qp->rq.head_en_count = 0; +} + +#ifdef MLX4_WQE_FORMAT +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + __be32 *wqe = get_send_wqe(qp, 0); + int wq_size = (qp->sq.wqe_cnt << qp->sq.wqe_shift); + int i; + + for (i = 0; i < wq_size; i += 64) + wqe[i / 4] = htonl(WQE_CTRL_OWN); +} + +static void set_owner_wqe(struct mlx4_qp *qp, unsigned int idx, int ds, + uint32_t owner_bit) +{ + uint32_t *wqe; + int max_sz = (1 << qp->sq.wqe_shift) / 4; + int cur_sz = ds * 4; + int tail_sz; + int i; + + if (max_sz - cur_sz < 16) + return; + + wqe = get_send_wqe(qp, idx & (qp->sq.wqe_cnt - 1)); + tail_sz = max_sz - cur_sz; + for (i = 0; tail_sz > 16; i += 4, tail_sz -= 16) + wqe[cur_sz + i * 4] = owner_bit; +} +#else +static void stamp_send_wqe(struct mlx4_qp *qp, unsigned int n) { uint32_t *wqe = get_send_wqe(qp, n); int i; @@ -80,14 +287,6 @@ wqe[i] = 0xffffffff; } -void mlx4_init_qp_indices(struct mlx4_qp *qp) -{ - qp->sq.head = 0; - qp->sq.tail = 0; - qp->rq.head = 0; - qp->rq.tail = 0; -} - void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) { struct mlx4_wqe_ctrl_seg *ctrl; @@ -95,29 +294,78 @@ for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); - ctrl->owner_opcode = htonl(1 << 31); + ctrl->owner_opcode = htonl(WQE_CTRL_OWN); ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); stamp_send_wqe(qp, i); } } +#endif -static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((noinline)); +static int __wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) { + struct mlx4_cq *cq = to_mcq(qp->verbs_qp.qp.send_cq); unsigned cur; + mlx4_lock(&cq->lock); cur = wq->head - wq->tail; - 
if (cur + nreq < wq->max_post) - return 0; + mlx4_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) __attribute__((always_inline)); +static inline int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_qp *qp) +{ + unsigned cur; - pthread_spin_lock(&cq->lock); cur = wq->head - wq->tail; - pthread_spin_unlock(&cq->lock); + if (likely(cur + nreq < wq->max_post)) + return 0; - return cur + nreq >= wq->max_post; + return __wq_overflow(wq, nreq, qp); +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_exp_send_wr *wr) +{ + uint64_t acc = wr->bind_mw.bind_info.exp_mw_access_flags; + bseg->flags1 = 0; + if (acc & IBV_EXP_ACCESS_REMOTE_ATOMIC) + bseg->flags1 |= htonl(MLX4_WQE_MW_ATOMIC); + if (acc & IBV_EXP_ACCESS_REMOTE_WRITE) + bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_WRITE); + if (acc & IBV_EXP_ACCESS_REMOTE_READ) + bseg->flags1 |= htonl(MLX4_WQE_MW_REMOTE_READ); + + bseg->flags2 = 0; + if (((struct verbs_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2) + bseg->flags2 |= htonl(MLX4_WQE_BIND_TYPE_2); + if (acc & IBV_EXP_ACCESS_MW_ZERO_BASED) + bseg->flags2 |= htonl(MLX4_WQE_BIND_ZERO_BASED); + + bseg->new_rkey = htonl(wr->bind_mw.rkey); + bseg->lkey = htonl(wr->bind_mw.bind_info.mr->lkey); + bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr); + bseg->length = htobe64(wr->bind_mw.bind_info.length); +} + +static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, + uint32_t rkey) __attribute__((always_inline)); +static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, + uint32_t rkey) +{ + iseg->mem_key = htonl(rkey); + + iseg->reserved1 = 0; + iseg->reserved2 = 0; + iseg->reserved3[0] = 0; + iseg->reserved3[1] = 0; } static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) __attribute__((always_inline)); +static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, uint64_t remote_addr, uint32_t rkey) { rseg->raddr = htonll(remote_addr); @@ -125,16 +373,33 @@ rseg->reserved = 0; } -static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr) +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, + struct ibv_exp_send_wr *wr) { - if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + struct ibv_exp_fetch_add *fa; + + if (wr->exp_opcode == IBV_EXP_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = htonll(wr->wr.atomic.swap); aseg->compare = htonll(wr->wr.atomic.compare_add); + } else if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) { + fa = &wr->ext_op.masked_atomics.wr_data.inline_data.op.fetch_add; + aseg->swap_add = htonll(fa->add_val); + aseg->compare = htonll(fa->field_boundary); } else { aseg->swap_add = htonll(wr->wr.atomic.compare_add); aseg->compare = 0; } +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + struct ibv_exp_send_wr *wr) +{ + struct ibv_exp_cmp_swap *cs = &wr->ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap; + aseg->swap_data = htonll(cs->swap_val); + aseg->cmp_data = htonll(cs->compare_val); + aseg->swap_mask = htonll(cs->swap_mask); + aseg->cmp_mask = htonll(cs->compare_mask); } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, @@ -147,14 +412,18 @@ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6); } -static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) 
__attribute__((always_inline)); +static inline void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) { dseg->byte_count = htonl(sg->length); dseg->lkey = htonl(sg->lkey); dseg->addr = htonll(sg->addr); } -static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg, + struct ibv_sge *sg, unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_ptr_data(struct mlx4_wqe_data_seg *dseg, + struct ibv_sge *sg, unsigned int owner_bit) { dseg->lkey = htonl(sg->lkey); dseg->addr = htonll(sg->addr); @@ -169,7 +438,10 @@ */ wmb(); - dseg->byte_count = htonl(sg->length); + if (likely(sg->length)) + dseg->byte_count = SET_BYTE_COUNT(sg->length); + else + dseg->byte_count = htonl(0x80000000); } /* @@ -177,84 +449,787 @@ * implementations may use move-string-buffer assembler instructions, * which do not guarantee order of copying. */ -static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +#if defined(__amd64__) +#define COPY_64B_WC(dst, src) \ + __asm__ __volatile__ ( \ + " movdqa (%1),%%xmm0\n" \ + " movdqa 16(%1),%%xmm1\n" \ + " movdqa 32(%1),%%xmm2\n" \ + " movdqa 48(%1),%%xmm3\n" \ + " movntdq %%xmm0, (%0)\n" \ + " movntdq %%xmm1, 16(%0)\n" \ + " movntdq %%xmm2, 32(%0)\n" \ + " movntdq %%xmm3, 48(%0)\n" \ + : : "r" (dst), "r" (src) : "memory"); \ + dst += 8; \ + src += 8 +#else +#define COPY_64B_WC(dst, src) \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++ +#endif + +static void mlx4_bf_copy(uint64_t *dst, uint64_t *src, unsigned bytecnt) { while (bytecnt > 0) { - *dst++ = *src++; - *dst++ = *src++; - bytecnt -= 2 * sizeof (long); + COPY_64B_WC(dst, src); + bytecnt -= 8 * sizeof(uint64_t); + } +} + +/* Convert WQE format to fit BF usage */ +static inline void convert_to_bf_wqe(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const unsigned wqe_idx) __attribute__((always_inline)); +static inline void convert_to_bf_wqe(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const unsigned wqe_idx) +{ + uint32_t *tmp = (uint32_t *)ctrl->reserved; + + ctrl->owner_opcode |= htonl((wqe_idx & 0xffff) << 8); + *tmp |= qp->doorbell_qpn; +} + +static inline void copy_wqe_to_bf(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const int aligned_size, + const unsigned wqe_idx, + const int dedic_bf, + const int one_thread_auto_evict) __attribute__((always_inline)); +static inline void copy_wqe_to_bf(struct mlx4_qp *qp, + struct mlx4_wqe_ctrl_seg *ctrl, + const int aligned_size, + const unsigned wqe_idx, + const int dedic_bf, + const int one_thread_auto_evict) +{ + convert_to_bf_wqe(qp, ctrl, wqe_idx); + + if (dedic_bf && one_thread_auto_evict) + /* + * In case QP has dedicated BF, only one thread using this QP + * and the CPU arch supports auto eviction of WC buffer we can move + * the wc_wmb before the bf_copy (usually it is located after the bf_copy). + * This provides significant improvement in message rate of small messages. + * This barrier keeps BF toggling order by ensuring that previous BF data + * is written to memory before writing to the next BF buffer. + */ + wc_wmb(); + else + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. 
+ */ + wmb(); + + if (dedic_bf) { + mlx4_bf_copy(qp->bf->dedic.address, (uint64_t *) ctrl, aligned_size); + } else { + mlx4_lock(&qp->bf->cmn.lock); + mlx4_bf_copy(qp->bf->cmn.address, (uint64_t *) ctrl, aligned_size); + } + if (!(dedic_bf && one_thread_auto_evict)) + /* + * This barrier ensures that BF data is written to memory + * before toggling the BF buffer. This is to keep the right + * toggling order and to prevent the case in which next BF data + * will be written before the current BF data. + * In addition this barrier ensures the eviction of the WC buffer. + * See comment above for the conditions in which this barrier may be + * set before the bf_copy. + */ + wc_wmb(); + + if (dedic_bf) { + /* Toggle BF buffer */ + qp->bf->dedic.address = (void *)((uintptr_t)qp->bf->dedic.address ^ qp->bf_buf_size); + } else { + /* Toggle BF buffer */ + qp->bf->cmn.address = (void *)((uintptr_t)qp->bf->cmn.address ^ qp->bf_buf_size); + mlx4_unlock(&qp->bf->cmn.lock); + } +} + +static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl, + const int use_bf, const int dedic_bf, const int one_thread_auto_evict, + const int prefer_bf) __attribute__((always_inline)); +static inline void __ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl, + const int use_bf, const int dedic_bf, const int one_thread_auto_evict, + const int prefer_bf) +{ + if (use_bf && nreq == 1 && (inl || prefer_bf) && + size > 1 && size <= qp->bf_buf_size / 16) { + copy_wqe_to_bf(qp, ctrl, align(size * 16, 64), + qp->sq.head , dedic_bf, + one_thread_auto_evict); + ++qp->sq.head; + } else if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * ringing non-cached doorbell record. + */ + nc_wmb(); + *qp->sdb = qp->doorbell_qpn; + } +} + +static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) __attribute__((noinline)); +static void __ring_db_mng(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) +{ + struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context); + + if (nreq == 1 && (inl || ctx->prefer_bf) && size > 1 && size <= qp->bf_buf_size / 16) { + convert_to_bf_wqe(qp, ctrl, qp->sq.head); + + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. 
+ */ + wmb(); + + ++qp->sq.head; + + wmb(); + + } else if (likely(nreq)) { + qp->sq.head += nreq; + + /* Controlled qp */ + wmb(); + } +} + +static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) __attribute__((always_inline)); +static inline void ring_db(struct mlx4_qp *qp, struct mlx4_wqe_ctrl_seg *ctrl, + int nreq, int size, int inl) +{ + if (unlikely(qp->create_flags & IBV_EXP_QP_CREATE_MANAGED_SEND)) + return __ring_db_mng(qp, ctrl, nreq, size, inl); + + switch (qp->db_method) { + case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 1); + case MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 1, 0); + case MLX4_QP_DB_METHOD_DEDIC_BF: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 1, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + case MLX4_QP_DB_METHOD_BF: + return __ring_db(qp, ctrl, nreq, size, inl, 1, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + case MLX4_QP_DB_METHOD_DB: + return __ring_db(qp, ctrl, nreq, size, inl, 0, 0, 0, to_mctx(qp->verbs_qp.qp.context)->prefer_bf); + } +} + +static void set_ctrl_seg(struct mlx4_wqe_ctrl_seg *ctrl, struct ibv_send_wr *wr, + struct mlx4_qp *qp, uint32_t imm, uint32_t srcrb_flags, + unsigned int owner_bit, int size, uint32_t wr_op) +{ + ctrl->srcrb_flags = srcrb_flags; + ctrl->imm = imm; + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + ctrl->owner_opcode = htonl(wr_op) | owner_bit; +} + +static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline)); +static inline int set_data_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) +{ + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + int i; + int inl = 0; + + seg = wqe; + wqe += sizeof(*seg); + off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < num_sge; ++i) { + addr = (void *) (uintptr_t) sg_list[i].addr; + len = sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return ENOMEM; + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len)); + seg_len = 0; + seg = wqe; + wqe += sizeof(*seg); + off = sizeof(*seg); + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (likely(seg_len)) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. 
+ */ + wmb(); + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_len)); + } + + *size += (inl + num_seg * sizeof(*seg) + 15) / 16; + + return 0; +} + +static inline void set_data_inl_seg_fast(struct mlx4_qp *qp, + void *addr, int length, + void *wqe, int *size, + unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_data_inl_seg_fast(struct mlx4_qp *qp, + void *addr, int length, + void *wqe, int *size, + unsigned int owner_bit) +{ + struct mlx4_wqe_inline_seg *seg; + static const int first_seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg) - sizeof(struct mlx4_wqe_ctrl_seg); + static const int seg_data_size = MLX4_INLINE_ALIGN - sizeof(*seg); + + seg = wqe; + wqe += sizeof(*seg); + + if (length <= first_seg_data_size) { + /* For the first segment there is no need to make sure + * all the data is visible before the byte_count field is set. + * This is because the ctrl segment at the beginning of the + * segment covers HCA prefetcher issue. + */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length)); + + memcpy(wqe, addr, length); + *size += (length + sizeof(*seg) + 15) / 16; + } else { + void *start_wqe = seg; + + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | first_seg_data_size)); + memcpy(wqe, addr, first_seg_data_size); + length -= first_seg_data_size; + addr += first_seg_data_size; + seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg)); + wqe += MLX4_INLINE_ALIGN - sizeof(struct mlx4_wqe_ctrl_seg); + + while (length > seg_data_size) { + memcpy(wqe, addr, seg_data_size); + wmb(); /* see comment below */ + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | seg_data_size)); + length -= seg_data_size ; + addr += seg_data_size; + seg = (struct mlx4_wqe_inline_seg *)((char *)seg + MLX4_INLINE_ALIGN); + wqe += MLX4_INLINE_ALIGN; + } + memcpy(wqe, addr, length); + + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. 
+ */ + wmb(); + seg->byte_count = SET_BYTE_COUNT((MLX4_INLINE_SEG | length)); + *size += (wqe + length - start_wqe + 15) / 16; + } +} + +static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) __attribute__((always_inline)); +static inline void set_data_non_inl_seg(struct mlx4_qp *qp, int num_sge, struct ibv_sge *sg_list, + void *wqe, int *size, unsigned int owner_bit) +{ + if (likely(num_sge == 1)) { + struct mlx4_wqe_data_seg *seg = wqe; + + set_ptr_data(seg, sg_list, owner_bit); + + *size += (sizeof(*seg) / 16); + } else { + struct mlx4_wqe_data_seg *seg = wqe; + int i; + + for (i = num_sge - 1; i >= 0 ; --i) + set_ptr_data(seg + i, sg_list + i, owner_bit); + + *size += num_sge * (sizeof(*seg) / 16); + } +} + +static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl, + int num_sge, struct ibv_sge *sg_list, int *inl, + unsigned int owner_bit) __attribute__((always_inline)); +static inline int set_data_seg(struct mlx4_qp *qp, void *seg, int *sz, int is_inl, + int num_sge, struct ibv_sge *sg_list, int *inl, + unsigned int owner_bit) +{ + if (is_inl) { + /* inl is set to true if this is an inline data segment and num_sge > 0 */ + *inl = num_sge > 0; + return set_data_inl_seg(qp, num_sge, sg_list, seg, sz, + owner_bit); + } + set_data_non_inl_seg(qp, num_sge, sg_list, seg, sz, owner_bit); + + return 0; +} + +static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp, + uint32_t srcrb_flags, uint32_t imm, + void *wqe, void *ctrl, int size, int *total_size, + int *inl, unsigned int ind) __attribute__((always_inline)); +static inline int set_common_segments(struct ibv_send_wr *wr, struct mlx4_qp *qp, + uint32_t srcrb_flags, uint32_t imm, + void *wqe, void *ctrl, int size, int *total_size, + int *inl, unsigned int ind) +{ + int ret; + unsigned int owner_bit = (ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0; + + ret = set_data_seg(qp, wqe, &size, !!(wr->send_flags & IBV_SEND_INLINE), + wr->num_sge, wr->sg_list, inl, owner_bit); + if (unlikely(ret)) + return ret; + + *total_size = size; + set_ctrl_seg(ctrl, wr, qp, imm, srcrb_flags, owner_bit, size, + mlx4_ib_opcode[wr->opcode]); + + return 0; + +} + +static int post_send_other(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? 
wr->imm_data : 0; + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); + +} + +static int post_send_rc_raw_packet(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + uint32_t imm; + int idx; + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + + /* Sanity check - prevent from posting empty SR */ + if (unlikely(!wr->num_sge)) + return EINVAL; + + if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) { + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED; + u.srcrb_flags = htonl((uint32_t)(qp->srcrb_flags_tbl[idx] | MLX4_WQE_CTRL_SOLICIT)); + /* For raw eth, take the dmac from the payload */ + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr; + imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2); + } else { + idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + + imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? wr->imm_data : 0; + } + + return set_common_segments(wr, qp, u.srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static int post_send_ud(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + uint32_t srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + uint32_t imm = (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ? 
wr->imm_data : 0; + + set_datagram_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_datagram_seg); + size += sizeof(struct mlx4_wqe_datagram_seg) / 16; + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static inline int post_send_connected(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind, int is_xrc) __attribute__((always_inline)); +static inline int post_send_connected(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind, int is_xrc) +{ + void *ctrl = wqe_add; + void *wqe = wqe_add + sizeof(struct mlx4_wqe_ctrl_seg); + uint32_t srcrb_flags; + uint32_t imm = 0; + int size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + int idx = (wr->send_flags & IBV_SEND_SIGNALED)/IBV_SEND_SIGNALED | + (wr->send_flags & IBV_SEND_SOLICITED)/(IBV_SEND_SOLICITED >> 1); + + if (is_xrc) + srcrb_flags = htonl((wr->qp_type.xrc.remote_srqn << 8) | + (qp->srcrb_flags_tbl[idx])); + else + srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, (struct ibv_exp_send_wr *)wr); + wqe += sizeof(struct mlx4_wqe_atomic_seg); + size += (sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IBV_WR_SEND_WITH_IMM: + imm = wr->imm_data; + break; + + case IBV_WR_RDMA_WRITE_WITH_IMM: + imm = wr->imm_data; + if (!wr->num_sge) + *inl = 1; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + size += sizeof(struct mlx4_wqe_raddr_seg) / 16; + break; + + case IBV_WR_RDMA_READ: + *inl = 1; + /* fall through */ + case IBV_WR_RDMA_WRITE: + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + size += sizeof(struct mlx4_wqe_raddr_seg) / 16; + + break; + + case IBV_WR_SEND: + break; + + default: + /* No extra segments required for sends */ + break; + } + + return set_common_segments(wr, qp, srcrb_flags, imm, wqe, ctrl, size, total_size, inl, ind); +} + +static int post_send_rc_uc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 0); +} + +static int post_send_xrc(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe_add, int *total_size, + int *inl, unsigned int ind) +{ + return post_send_connected(wr, qp, wqe_add, total_size, inl, ind, 1); +} + +void mlx4_update_post_send_one(struct mlx4_qp *qp) +{ + switch (qp->qp_type) { + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC: + qp->post_send_one = post_send_xrc; + break; + case IBV_QPT_RC: + case IBV_QPT_UC: + qp->post_send_one = post_send_rc_uc; + break; + case IBV_QPT_UD: + qp->post_send_one = post_send_ud; + break; + + case IBV_QPT_RAW_PACKET: + qp->post_send_one = post_send_rc_raw_packet; + break; + + default: + qp->post_send_one = post_send_other; + break; } } int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr) + struct ibv_send_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + void *uninitialized_var(ctrl); + unsigned int ind; + int nreq; + int inl = 0; + int ret = 0; + int size = 0; + + mlx4_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + 
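+ /*
+ * ind is the send-queue producer index; the WQE slot is always
+ * ind & (qp->sq.wqe_cnt - 1), and the ind & qp->sq.wqe_cnt bit
+ * supplies the ownership bit, which toggles on every wrap of
+ * the send queue.
+ */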
+ ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + /* to be considered whether can throw first check, create_qp_exp with post_send */ + if (!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW)) + if (unlikely(wq_overflow(&qp->sq, nreq, qp))) { + ret = ENOMEM; + errno = ret; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + ret = ENOMEM; + errno = ret; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->opcode >= sizeof(mlx4_ib_opcode) / sizeof(mlx4_ib_opcode[0]))) { + ret = EINVAL; + errno = ret; + *bad_wr = wr; + goto out; + } + + ctrl = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ret = qp->post_send_one(wr, qp, ctrl, &size, &inl, ind); + if (unlikely(ret)) { + inl = 0; + errno = ret; + *bad_wr = wr; + goto out; + } + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (likely(wr->next)) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); +#else + /* Make sure all owners bits are set to HW ownership */ + set_owner_wqe(qp, ind, size, + ((ind & qp->sq.wqe_cnt) ? htonl(WQE_CTRL_OWN) : 0)); +#endif + + ++ind; + } + +out: + ring_db(qp, ctrl, nreq, size, inl); + + if (likely(nreq)) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, ind - 1, size, + ((ind - 1) & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0)); +#endif + mlx4_unlock(&qp->sq.lock); + + return ret; +} + +int mlx4_exp_post_send(struct ibv_qp *ibqp, struct ibv_exp_send_wr *wr, + struct ibv_exp_send_wr **bad_wr) { - struct mlx4_context *ctx; struct mlx4_qp *qp = to_mqp(ibqp); void *wqe; - struct mlx4_wqe_ctrl_seg *ctrl; - int ind; + void *uninitialized_var(ctrl); + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + uint32_t imm; + int idx; + unsigned int ind; + int uninitialized_var(owner_bit); int nreq; int inl = 0; int ret = 0; - int size; - int i; + int size = 0; + uint32_t mlx4_wr_op; + uint64_t exp_send_flags; - pthread_spin_lock(&qp->sq.lock); + mlx4_lock(&qp->sq.lock); /* XXX check that state is OK to post send */ ind = qp->sq.head; for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { - ret = -1; + exp_send_flags = wr->exp_send_flags; + + if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_SQ_OVERFLOW) && + wq_overflow(&qp->sq, nreq, qp))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + ret = ENOMEM; *bad_wr = wr; goto out; } - if (wr->num_sge > qp->sq.max_gs) { - ret = -1; + if (unlikely(wr->exp_opcode >= sizeof(mlx4_ib_opcode_exp) / sizeof(mlx4_ib_opcode_exp[0]))) { + ret = EINVAL; *bad_wr = wr; goto out; } - if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) { - ret = -1; + if (((MLX4_IB_OPCODE_GET_CLASS(mlx4_ib_opcode_exp[wr->exp_opcode]) == MLX4_OPCODE_MANAGED) || + (exp_send_flags & IBV_EXP_SEND_WITH_CALC)) && + !(qp->create_flags & IBV_EXP_QP_CREATE_CROSS_CHANNEL)) { + ret = EINVAL; *bad_wr = wr; goto out; } + mlx4_wr_op = MLX4_IB_OPCODE_GET_OP(mlx4_ib_opcode_exp[wr->exp_opcode]); + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + owner_bit = ind & qp->sq.wqe_cnt ? 
htonl(WQE_CTRL_OWN) : 0; - ctrl->xrcrb_flags = - (wr->send_flags & IBV_SEND_SIGNALED ? - htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | - (wr->send_flags & IBV_SEND_SOLICITED ? - htonl(MLX4_WQE_CTRL_SOLICIT) : 0) | - qp->sq_signal_bits; + idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED | + (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) | + (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2); + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); - if (wr->opcode == IBV_WR_SEND_WITH_IMM || - wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) - ctrl->imm = wr->imm_data; - else - ctrl->imm = 0; + imm = (MLX4_IB_OPCODE_GET_ATTR(mlx4_ib_opcode_exp[wr->exp_opcode]) & MLX4_OPCODE_WITH_IMM ? + wr->ex.imm_data : 0); - wqe += sizeof *ctrl; - size = sizeof *ctrl / 16; + wqe += sizeof(struct mlx4_wqe_ctrl_seg); + size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; - switch (ibqp->qp_type) { + switch (qp->qp_type) { + case IBV_QPT_XRC_SEND: case IBV_QPT_XRC: - ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8); - /* fall thru */ + u.srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr); + /* fall through */ case IBV_QPT_RC: case IBV_QPT_UC: - switch (wr->opcode) { - case IBV_WR_ATOMIC_CMP_AND_SWP: - case IBV_WR_ATOMIC_FETCH_AND_ADD: - set_raddr_seg(wqe, wr->wr.atomic.remote_addr, - wr->wr.atomic.rkey); + switch (wr->exp_opcode) { + case IBV_EXP_WR_ATOMIC_CMP_AND_SWP: + case IBV_EXP_WR_ATOMIC_FETCH_AND_ADD: + case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD: + if (wr->exp_opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) { + if (!qp->is_masked_atomic) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + set_raddr_seg(wqe, + wr->ext_op.masked_atomics.remote_addr, + wr->ext_op.masked_atomics.rkey); + } else { + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + } wqe += sizeof (struct mlx4_wqe_raddr_seg); set_atomic_seg(wqe, wr); @@ -264,184 +1239,259 @@ break; - case IBV_WR_RDMA_READ: + case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP: + if (!qp->is_masked_atomic) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + set_raddr_seg(wqe, + wr->ext_op.masked_atomics.remote_addr, + wr->ext_op.masked_atomics.rkey); + wqe += sizeof(struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, wr); + wqe += sizeof(struct mlx4_wqe_masked_atomic_seg); + size += (sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_masked_atomic_seg)) / 16; + break; + + case IBV_EXP_WR_RDMA_READ: inl = 1; /* fall through */ - case IBV_WR_RDMA_WRITE: - case IBV_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(wqe, wr->wr.rdma.remote_addr, - wr->wr.rdma.rkey); + case IBV_EXP_WR_RDMA_WRITE_WITH_IMM: + if (!wr->num_sge) + inl = 1; + /* fall through */ + case IBV_EXP_WR_RDMA_WRITE: + if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) { + + if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER || + (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER || + (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER || + !mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].valid) { + ret = -1; + *bad_wr = wr; + goto out; + } + + mlx4_wr_op = MLX4_OPCODE_CALC_RDMA_WRITE_IMM | + mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].opcode; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + + } else { + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + } wqe += sizeof (struct mlx4_wqe_raddr_seg); size += sizeof (struct mlx4_wqe_raddr_seg) / 16; break; - 
default: - /* No extra segments required for sends */ + case IBV_EXP_WR_LOCAL_INV: + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof + (struct mlx4_wqe_local_inval_seg); + size += sizeof + (struct mlx4_wqe_local_inval_seg) / 16; break; - } - break; - - case IBV_QPT_UD: - set_datagram_seg(wqe, wr); - wqe += sizeof (struct mlx4_wqe_datagram_seg); - size += sizeof (struct mlx4_wqe_datagram_seg) / 16; - if (to_mah(wr->wr.ud.ah)->tagged) { - ctrl->ins_vlan = 1 << 6; - ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan); - } - break; + case IBV_EXP_WR_BIND_MW: + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof + (struct mlx4_wqe_bind_seg); + size += sizeof + (struct mlx4_wqe_bind_seg) / 16; + break; - default: - break; - } + case IBV_EXP_WR_SEND: + if (exp_send_flags & IBV_EXP_SEND_WITH_CALC) { + + if ((uint32_t)wr->op.calc.data_size >= IBV_EXP_CALC_DATA_SIZE_NUMBER || + (uint32_t)wr->op.calc.calc_op >= IBV_EXP_CALC_OP_NUMBER || + (uint32_t)wr->op.calc.data_type >= IBV_EXP_CALC_DATA_TYPE_NUMBER || + !mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].valid) { + ret = -1; + *bad_wr = wr; + goto out; + } + + mlx4_wr_op = MLX4_OPCODE_CALC_SEND | + mlx4_calc_ops_table + [wr->op.calc.data_size] + [wr->op.calc.calc_op] + [wr->op.calc.data_type].opcode; + } - if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { - struct mlx4_wqe_inline_seg *seg; - void *addr; - int len, seg_len; - int num_seg; - int off, to_copy; + break; - inl = 0; + case IBV_EXP_WR_CQE_WAIT: + { + struct mlx4_cq *wait_cq = to_mcq(wr->task.cqe_wait.cq); + uint32_t wait_index = 0; - seg = wqe; - wqe += sizeof *seg; - off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); - num_seg = 0; - seg_len = 0; + wait_index = wait_cq->wait_index + + wr->task.cqe_wait.cq_count; + wait_cq->wait_count = max(wait_cq->wait_count, + wr->task.cqe_wait.cq_count); - for (i = 0; i < wr->num_sge; ++i) { - addr = (void *) (uintptr_t) wr->sg_list[i].addr; - len = wr->sg_list[i].length; - inl += len; + if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) { + wait_cq->wait_index += wait_cq->wait_count; + wait_cq->wait_count = 0; + } - if (inl > qp->max_inline_data) { - inl = 0; - ret = -1; - *bad_wr = wr; - goto out; + set_wait_en_seg(wqe, wait_cq->cqn, wait_index); + wqe += sizeof(struct mlx4_wqe_wait_en_seg); + size += sizeof(struct mlx4_wqe_wait_en_seg) / 16; } + break; - while (len >= MLX4_INLINE_ALIGN - off) { - to_copy = MLX4_INLINE_ALIGN - off; - memcpy(wqe, addr, to_copy); - len -= to_copy; - wqe += to_copy; - addr += to_copy; - seg_len += to_copy; - wmb(); /* see comment below */ - seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); - seg_len = 0; - seg = wqe; - wqe += sizeof *seg; - off = sizeof *seg; - ++num_seg; + case IBV_EXP_WR_SEND_ENABLE: + case IBV_EXP_WR_RECV_ENABLE: + { + unsigned head_en_index; + struct mlx4_wq *wq; + + /* + * Posting work request for QP that does not support + * SEND/RECV ENABLE makes performance worse. + */ + if (((wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) && + !(to_mqp(wr->task.wqe_enable.qp)->create_flags & + IBV_EXP_QP_CREATE_MANAGED_SEND)) || + ((wr->exp_opcode == IBV_EXP_WR_RECV_ENABLE) && + !(to_mqp(wr->task.wqe_enable.qp)->create_flags & + IBV_EXP_QP_CREATE_MANAGED_RECV))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wq = (wr->exp_opcode == IBV_EXP_WR_SEND_ENABLE) ? 
+ &to_mqp(wr->task.wqe_enable.qp)->sq : + &to_mqp(wr->task.wqe_enable.qp)->rq; + + /* If wqe_count is 0 release all WRs from queue */ + if (wr->task.wqe_enable.wqe_count) { + head_en_index = wq->head_en_index + + wr->task.wqe_enable.wqe_count; + wq->head_en_count = max(wq->head_en_count, + wr->task.wqe_enable.wqe_count); + + if ((int)(wq->head - head_en_index) < 0) { + ret = -1; + *bad_wr = wr; + goto out; + } + } else { + head_en_index = wq->head; + wq->head_en_count = wq->head - wq->head_en_index; + } + + if (exp_send_flags & IBV_EXP_SEND_WAIT_EN_LAST) { + wq->head_en_index += wq->head_en_count; + wq->head_en_count = 0; + } + + set_wait_en_seg(wqe, + wr->task.wqe_enable.qp->qp_num, + head_en_index); + + wqe += sizeof(struct mlx4_wqe_wait_en_seg); + size += sizeof(struct mlx4_wqe_wait_en_seg) / 16; } + break; - memcpy(wqe, addr, len); - wqe += len; - seg_len += len; - off += len; - } + case IBV_EXP_WR_SEND_WITH_INV: + imm = htonl(wr->ex.invalidate_rkey); + break; - if (seg_len) { - ++num_seg; - /* - * Need a barrier here to make sure - * all the data is visible before the - * byte_count field is set. Otherwise - * the HCA prefetcher could grab the - * 64-byte chunk with this inline - * segment and get a valid (!= - * 0xffffffff) byte count but stale - * data, and end up sending the wrong - * data. - */ - wmb(); - seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + default: + /* No extra segments required for sends */ + break; } + break; - size += (inl + num_seg * sizeof * seg + 15) / 16; - } else { - struct mlx4_wqe_data_seg *seg = wqe; + case IBV_QPT_UD: + set_datagram_seg(wqe, (struct ibv_send_wr *)wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; - for (i = wr->num_sge - 1; i >= 0 ; --i) - set_data_seg(seg + i, wr->sg_list + i); + case IBV_QPT_RAW_PACKET: + /* Sanity check - prevent from posting empty SR */ + if (unlikely(!wr->num_sge)) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) { + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + u.srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT); + /* For raw eth, take the dmac from the payload */ + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)wr->sg_list[0].addr; + imm = *(uint32_t *)((uintptr_t)(wr->sg_list[0].addr)+2); + } + break; - size += wr->num_sge * (sizeof *seg / 16); + default: + break; } - ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? - MLX4_WQE_CTRL_FENCE : 0) | size; - - /* - * Make sure descriptor is fully written before - * setting ownership bit (because HW can start - * executing as soon as we do). - */ - wmb(); - - ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) | - (ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0); + ret = set_data_seg(qp, wqe, &size, !!(exp_send_flags & IBV_EXP_SEND_INLINE), + wr->num_sge, wr->sg_list, &inl, owner_bit); + if (unlikely(ret)) { + inl = 0; + *bad_wr = wr; + goto out; + } + set_ctrl_seg(ctrl, (struct ibv_send_wr *)wr, qp, imm, u.srcrb_flags, owner_bit, size, mlx4_wr_op); /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. 
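The legacy inline-copy code being removed above carries the key ordering rule for inline data: the payload must be globally visible before a non-zero byte_count is, otherwise the HCA prefetcher can see a valid count with stale data. The sketch below expresses that publish-last discipline with portable C11 atomics instead of the driver's wmb(); the struct layout is invented and does not match the real WQE segments, and in the driver the consumer is the HCA rather than another thread.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented stand-in for an inline segment: payload plus a byte count
 * whose non-zero value tells the consumer the payload is valid. */
struct inline_seg {
    char buf[60];
    _Atomic uint32_t byte_count;   /* 0 = not ready */
};

static void publish(struct inline_seg *seg, const void *data, uint32_t len)
{
    memcpy(seg->buf, data, len);
    /* Release store: everything written above is visible before the
     * count, the moral equivalent of wmb(); seg->byte_count = ... */
    atomic_store_explicit(&seg->byte_count, len, memory_order_release);
}

static uint32_t consume(const struct inline_seg *seg, void *out)
{
    /* Acquire load pairs with the release store above. */
    uint32_t len = atomic_load_explicit(&seg->byte_count, memory_order_acquire);

    if (len)
        memcpy(out, seg->buf, len);
    return len;
}

int main(void)
{
    struct inline_seg seg = { .byte_count = 0 };
    char out[60];

    publish(&seg, "hello", 5);
    printf("consumed %u bytes\n", consume(&seg, out));
    return 0;
}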
*/ - if (wr->next) + if (likely(wr->next)) +#ifndef MLX4_WQE_FORMAT stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & (qp->sq.wqe_cnt - 1)); - +#else + set_owner_wqe(qp, ind, size, owner_bit); +#endif ++ind; } out: - ctx = to_mctx(ibqp->context); - - if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) { - ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8); - *(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn; - /* - * Make sure that descriptor is written to memory - * before writing to BlueFlame page. - */ - wmb(); - - ++qp->sq.head; - - pthread_spin_lock(&ctx->bf_lock); - - mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl, - align(size * 16, 64)); - wc_wmb(); - - ctx->bf_offset ^= ctx->bf_buf_size; - - pthread_spin_unlock(&ctx->bf_lock); - } else if (nreq) { - qp->sq.head += nreq; - - /* - * Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn; - } - - if (nreq) + ring_db(qp, ctrl, nreq, size, inl); + if (likely(nreq)) +#ifndef MLX4_WQE_FORMAT stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, ind - 1, size, owner_bit); +#endif - pthread_spin_unlock(&qp->sq.lock); + mlx4_unlock(&qp->sq.lock); return ret; } + + int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { @@ -449,24 +1499,25 @@ struct mlx4_wqe_data_seg *scat; int ret = 0; int nreq; - int ind; + unsigned int ind; int i; + struct mlx4_inlr_rbuff *rbuffs; - pthread_spin_lock(&qp->rq.lock); + mlx4_lock(&qp->rq.lock); /* XXX check that state is OK to post receive */ - ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { - ret = -1; + if (unlikely(!(qp->create_flags & IBV_EXP_QP_CREATE_IGNORE_RQ_OVERFLOW) && + wq_overflow(&qp->rq, nreq, qp))) { + ret = ENOMEM; *bad_wr = wr; goto out; } - if (wr->num_sge > qp->rq.max_gs) { - ret = -1; + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + ret = EINVAL; *bad_wr = wr; goto out; } @@ -476,11 +1527,20 @@ for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); - if (i < qp->rq.max_gs) { + if (likely(i < qp->rq.max_gs)) { scat[i].byte_count = 0; scat[i].lkey = htonl(MLX4_INVALID_LKEY); scat[i].addr = 0; } + if (qp->max_inlr_sg) { + rbuffs = qp->inlr_buff.buff[ind].sg_list; + qp->inlr_buff.buff[ind].list_len = wr->num_sge; + for (i = 0; i < wr->num_sge; ++i) { + rbuffs->rbuff = (void *)(unsigned long)(wr->sg_list[i].addr); + rbuffs->rlen = wr->sg_list[i].length; + rbuffs++; + } + } qp->rq.wrid[ind] = wr->wr_id; @@ -488,7 +1548,7 @@ } out: - if (nreq) { + if (likely(nreq)) { qp->rq.head += nreq; /* @@ -500,7 +1560,7 @@ *qp->db = htonl(qp->rq.head & 0xffff); } - pthread_spin_unlock(&qp->rq.lock); + mlx4_unlock(&qp->rq.lock); return ret; } @@ -533,6 +1593,7 @@ struct mlx4_qp *qp) { int size; + int atomic_size; int max_sq_sge; max_sq_sge = align(cap->max_inline_data + @@ -553,6 +1614,7 @@ size += sizeof (struct mlx4_wqe_raddr_seg); break; + case IBV_QPT_XRC_SEND: case IBV_QPT_XRC: case IBV_QPT_RC: size += sizeof (struct mlx4_wqe_raddr_seg); @@ -560,12 +1622,14 @@ * An atomic op will require an atomic segment, a * remote address segment and one scatter entry. 
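The removed doorbell tail above shows the two ways the hardware is kicked: a single small, fully inline WQE is copied straight into the BlueFlame (write-combining) buffer, everything else gets a plain doorbell write; the new code moves this choice into ring_db(). The following sketch models only that decision with stub I/O functions and a placeholder buffer size; the real bf_lock, wc_wmb() and offset toggling are not reproduced.

#include <stdint.h>
#include <stdio.h>

/* Placeholder limit; the real value comes from the device context. */
#define BF_BUF_SIZE   256          /* bytes available per BlueFlame buffer */

/* Stubs standing in for the MMIO accessors in the driver. */
static void bf_copy_stub(const void *wqe, unsigned bytes)
{
    printf("BlueFlame: copy %u bytes of the WQE to WC memory\n", bytes);
}

static void doorbell_stub(uint32_t qpn)
{
    printf("Doorbell: write qpn 0x%x to the send doorbell register\n", qpn);
}

/*
 * Decide how to notify the HCA after posting.  A lone, fully inline WQE
 * that fits in the BlueFlame buffer can be pushed directly (saving the
 * doorbell round trip); anything else falls back to the doorbell.
 */
static void kick_hw(int nreq, int inline_only, int wqe_size_16b, uint32_t qpn,
                    const void *wqe)
{
    if (nreq == 1 && inline_only &&
        wqe_size_16b > 1 && wqe_size_16b < BF_BUF_SIZE / 16)
        bf_copy_stub(wqe, (unsigned)wqe_size_16b * 16);
    else if (nreq)
        doorbell_stub(qpn);
}

int main(void)
{
    uint64_t fake_wqe[8] = { 0 };

    kick_hw(1, 1, 4, 0x1234, fake_wqe);   /* small inline WQE -> BlueFlame */
    kick_hw(3, 0, 4, 0x1234, fake_wqe);   /* batch -> doorbell */
    return 0;
}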
*/ - if (size < (sizeof (struct mlx4_wqe_atomic_seg) + - sizeof (struct mlx4_wqe_raddr_seg) + - sizeof (struct mlx4_wqe_data_seg))) - size = (sizeof (struct mlx4_wqe_atomic_seg) + - sizeof (struct mlx4_wqe_raddr_seg) + - sizeof (struct mlx4_wqe_data_seg)); + atomic_size = (qp->is_masked_atomic ? + sizeof(struct mlx4_wqe_masked_atomic_seg) : + sizeof(struct mlx4_wqe_atomic_seg)) + + sizeof(struct mlx4_wqe_raddr_seg) + + sizeof(struct mlx4_wqe_data_seg); + + if (size < atomic_size) + size = atomic_size; break; default: @@ -583,56 +1647,39 @@ ; /* nothing */ } -int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, struct mlx4_qp *qp) +int mlx4_use_huge(struct ibv_context *context, const char *key) { - qp->rq.max_gs = cap->max_recv_sge; + char e[VERBS_MAX_ENV_VAL]; - qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); - if (!qp->sq.wrid) - return -1; + if (!ibv_exp_cmd_getenv(context, key, e, sizeof(e)) && !strcmp(e, "y")) + return 1; + return 0; +} + +void mlx4_dealloc_qp_buf(struct ibv_context *context, struct mlx4_qp *qp) +{ if (qp->rq.wqe_cnt) { - qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); - if (!qp->rq.wrid) { - free(qp->sq.wrid); - return -1; + free(qp->rq.wrid); + if (qp->max_inlr_sg) { + free(qp->inlr_buff.buff[0].sg_list); + free(qp->inlr_buff.buff); } } - - for (qp->rq.wqe_shift = 4; - 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg); - qp->rq.wqe_shift++) - ; /* nothing */ - - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << qp->sq.wqe_shift); - if (qp->rq.wqe_shift > qp->sq.wqe_shift) { - qp->rq.offset = 0; - qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - } else { - qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; - qp->sq.offset = 0; - } - - if (mlx4_alloc_buf(&qp->buf, - align(qp->buf_size, to_mdev(pd->context->device)->page_size), - to_mdev(pd->context->device)->page_size)) { + if (qp->sq.wqe_cnt) free(qp->sq.wrid); - free(qp->rq.wrid); - return -1; - } - memset(qp->buf.buf, 0, qp->buf_size); - - return 0; + if (qp->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(context), &qp->buf); + else + mlx4_free_buf(&qp->buf); } void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, enum ibv_qp_type type) { int wqe_size; - struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context); + struct mlx4_context *ctx = to_mctx(qp->verbs_qp.qp.context); wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) - sizeof (struct mlx4_wqe_ctrl_seg); @@ -641,9 +1688,10 @@ wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); break; + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC: case IBV_QPT_UC: case IBV_QPT_RC: - case IBV_QPT_XRC: wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); break; @@ -704,3 +1752,812 @@ else ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; } + +int mlx4_post_task(struct ibv_context *context, + struct ibv_exp_task *task_list, + struct ibv_exp_task **bad_task) +{ + int rc = 0; + struct ibv_exp_task *cur_task = NULL; + struct ibv_exp_send_wr *bad_wr; + struct mlx4_context *mlx4_ctx = to_mctx(context); + + if (!task_list) + return rc; + + pthread_mutex_lock(&mlx4_ctx->task_mutex); + + cur_task = task_list; + while (!rc && cur_task) { + + switch (cur_task->task_type) { + case IBV_EXP_TASK_SEND: + rc = ibv_exp_post_send(cur_task->item.qp, + cur_task->item.send_wr, + &bad_wr); + break; + + case IBV_EXP_TASK_RECV: + rc = ibv_post_recv(cur_task->item.qp, + cur_task->item.recv_wr, + NULL); + break; + + default: + rc = -1; + } + + if (rc && bad_task) { + 
*bad_task = cur_task; + break; + } + + cur_task = cur_task->next; + } + + pthread_mutex_unlock(&mlx4_ctx->task_mutex); + + return rc; +} + +/* + * family interfaces functions + */ + +/* + * send_pending - is a general post send function that put one message in + * the send queue. The function is not ringing the QP door-bell. + * + * User may call this function several times to fill send queue with + * several messages, then he can call mlx4_send_flush to ring the QP DB + * + * This function is used to implement the following QP burst family functions: + * - send_pending + * - send_pending_inline + * - send_pending_sg_list + * - send_burst + */ +static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, + const int use_raw_eth, const int use_inl, + const int thread_safe, const int wqe_64, + const int use_sg_list, int num_sge, + struct ibv_sge *sg_list, + const int lb) __attribute__((always_inline)); +static inline int send_pending(struct ibv_qp *ibqp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, + const int use_raw_eth, const int use_inl, + const int thread_safe, const int wqe_64, + const int use_sg_list, int num_sge, + struct ibv_sge *sg_list, + const int lb) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + uint32_t tunnel_offload = 0; + unsigned int owner_bit = qp->sq.head & qp->sq.wqe_cnt ? htonl(WQE_CTRL_OWN) : 0; + int size; + int idx; + int i; + + if (thread_safe) + mlx4_lock(&qp->sq.lock); + + if (wqe_64) + ctrl = get_send_wqe64(qp, qp->sq.head & (qp->sq.wqe_cnt - 1)); + else + ctrl = get_send_wqe(qp, qp->sq.head & (qp->sq.wqe_cnt - 1)); + + dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) + sizeof(struct mlx4_wqe_ctrl_seg)); + + if (use_sg_list) { + for (i = num_sge - 1; i >= 0 ; --i) + set_ptr_data(dseg + i, sg_list + i, owner_bit); + + size = (sizeof(struct mlx4_wqe_ctrl_seg) + (num_sge * sizeof(struct mlx4_wqe_data_seg)))/ 16; + } else { + if (use_inl) { + size = sizeof(struct mlx4_wqe_ctrl_seg) / 16; + set_data_inl_seg_fast(qp, (void *)(uintptr_t)addr, length, dseg, &size, owner_bit); + } else { + size = (sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))/ 16; + dseg->byte_count = SET_BYTE_COUNT(length); + dseg->lkey = htonl(lkey); + dseg->addr = htonll(addr); + } + } + + if (use_raw_eth) { + /* For raw eth, the SOLICIT flag is used + * to indicate that no icrc should be calculated */ + idx = IBV_EXP_QP_BURST_SOLICITED | + (flags & (IBV_EXP_QP_BURST_SIGNALED | + IBV_EXP_QP_BURST_IP_CSUM | + IBV_EXP_QP_BURST_TUNNEL)); + tunnel_offload = flags & IBV_EXP_QP_BURST_TUNNEL ? MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_IL4 : 0; + } else { + idx = (flags & (IBV_EXP_QP_BURST_SIGNALED | + IBV_EXP_QP_BURST_SOLICITED | + IBV_EXP_QP_BURST_IP_CSUM)); + } + + if (use_raw_eth && lb) { + union { + uint32_t srcrb_flags; + uint16_t srcrb_flags16[2]; + } u; + + u.srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + /* For raw eth, take the dmac from the payload */ + if (use_sg_list) + addr = sg_list[0].addr; + u.srcrb_flags16[0] = *(uint16_t *)(uintptr_t)addr; + ctrl->srcrb_flags = u.srcrb_flags; + ctrl->imm = *(uint32_t *)((uintptr_t)(addr)+2); + } else { + ctrl->srcrb_flags = htonl((uint32_t)qp->srcrb_flags_tbl[idx]); + ctrl->imm = 0; + } + ctrl->fence_size = (flags & IBV_EXP_QP_BURST_FENCE ? 
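The comment block above spells out the burst-family calling pattern: queue several messages with send_pending()/send_pending_inline(), then ring the doorbell once with send_flush(). A hedged usage sketch follows; the verbs_exp.h header name is an assumption, the family pointer is assumed to have been obtained from the experimental query-interface call (not shown), and the signalling policy is just an example.

#include <stdint.h>
#include <infiniband/verbs_exp.h>   /* assumed header for the experimental verbs API */

/*
 * Queue a batch of pre-registered buffers and ring the doorbell once.
 * 'fam' is assumed to have been returned for this QP by the experimental
 * query-interface call; error handling is minimal.
 */
static int send_batch(struct ibv_exp_qp_burst_family *fam, struct ibv_qp *qp,
                      const struct ibv_sge *bufs, int n)
{
    int i, ret;

    for (i = 0; i < n; i++) {
        /* Only the last message asks for a completion. */
        uint32_t flags = (i == n - 1) ? IBV_EXP_QP_BURST_SIGNALED : 0;

        ret = fam->send_pending(qp, bufs[i].addr, bufs[i].length,
                                bufs[i].lkey, flags);
        if (ret)
            return ret;
    }

    /* One doorbell (or BlueFlame write) for the whole batch. */
    return fam->send_flush(qp);
}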
MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = htonl(MLX4_OPCODE_SEND | tunnel_offload) | owner_bit; + qp->sq.head++; + + if (!wqe_64) +#ifndef MLX4_WQE_FORMAT + stamp_send_wqe(qp, (qp->sq.head + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); +#else + set_owner_wqe(qp, qp->sq.head, size, owner_bit); +#endif + if (thread_safe) + mlx4_unlock(&qp->sq.lock); + else + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + return 0; +} + +/* burst family - send_pending */ +static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_safe(struct ibv_qp *qp, uint64_t addr, + uint32_t length, uint32_t lkey, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(qp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && + mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, inl, safe, */ + return send_pending(qp, addr, length, lkey, flags, raw_eth, 0, 1, + /* wqe_64, use_sg, num_sge, sg_list, lb */ + wqe_64, 0, 0, NULL, lb); +} + +static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_safe_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) +{ + return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 1); +} + +static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_safe_no_lb(struct ibv_qp *qp, uint64_t addr, uint32_t length, uint32_t lkey, uint32_t flags) +{ + return mlx4_send_pending_safe(qp, addr, length, lkey, flags, 0); +} + +#define MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, uint64_t addr, \ + uint32_t length, uint32_t lkey, \ + uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, uint64_t addr, \ + uint32_t length, uint32_t lkey, \ + uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(qp, addr, length, lkey, flags, eth, 0, \ + /* safe, wqe_64, use_sg, num_sge, sg_list */ \ + 0, wqe64, 0, 0, NULL, \ + /* lb */ \ + lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_UNSAFE(1, 1, 1); + +/* burst family - send_pending_inline */ +static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr, + uint32_t length, uint32_t flags, + const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_inl_safe(struct ibv_qp *qp, void *addr, + uint32_t length, uint32_t flags, + const int lb) +{ + struct mlx4_qp *mqp = to_mqp(qp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = 
mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, */ + return send_pending(qp, (uintptr_t)addr, length, 0, flags, raw_eth, + /* inl, safe, wqe_64, use_sg, num_sge, sg_list, lb */ + 1, 1, wqe_64, 0, 0, NULL, lb); +} + +static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_inl_safe_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) +{ + return mlx4_send_pending_inl_safe(qp, addr, length, flags, 1); +} + +static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_inl_safe_no_lb(struct ibv_qp *qp, void *addr, uint32_t length, uint32_t flags) +{ + return mlx4_send_pending_inl_safe(qp, addr, length, flags, 0); +} + +#define MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_inl_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_INL_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, void *addr, \ + uint32_t length, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *qp, void *addr, \ + uint32_t length, uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(qp, (uintptr_t)addr, length, 0, flags, eth, 1, \ + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \ + 0, wqe64, 0, 0, NULL, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_INL_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_INL_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_INL_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_INL_UNSAFE(1, 1, 1); + +/* burst family - send_pending_sg_list */ +static inline int mlx4_send_pending_sg_list_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_pending_sg_list_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(ibqp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + + /* qp, addr, length, lkey, flags, raw_eth, inl, */ + return send_pending(ibqp, 0, 0, 0, flags, raw_eth, 0, + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ + 1, wqe_64, 1, num, sg_list, lb); +} +static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_sg_list_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 1); +} + +static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_pending_sg_list_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_pending_sg_list_safe(ibqp, sg_list, num, flags, 0); +} + +#define MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb) mlx4_send_pending_sg_list_unsafe_##eth##wqe64##lb +#define MLX4_SEND_PENDING_SG_LIST_UNSAFE(eth, wqe64, lb) \ + static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \ + 
struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + /* qp, addr, length, lkey, flags, eth, inl, */ \ + return send_pending(ibqp, 0, 0, 0, flags, eth, 0, \ + /* safe, wqe_64, use_sg, num_sge, sg_list, lb */ \ + 0, wqe64, 1, num, sg_list, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 0, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(0, 1, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 0, 1); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 0); +MLX4_SEND_PENDING_SG_LIST_UNSAFE(1, 1, 1); + +static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) __attribute__((always_inline)); +/* burst family - send_burst */ +static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int raw_eth, const int thread_safe, + const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) __attribute__((always_inline)); +static inline int send_msg_list(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int raw_eth, const int thread_safe, + const int wqe_64, const int use_bf, const int _1thrd_evict, const int lb) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int i; + + if (unlikely(thread_safe)) + mlx4_lock(&qp->sq.lock); + + for (i = 0; i < num; i++, sg_list++) + /* qp, addr, length, lkey, */ + send_pending(ibqp, sg_list->addr, sg_list->length, sg_list->lkey, + /* flags, raw_eth, inl, safe, wqe_64, use_sg, */ + flags, raw_eth, 0, 0, wqe_64, 0, + /* num_sge, sg_list, lb */ + 0, NULL, lb); + + if (use_bf) + /* use send_flush_unsafe since lock is already taken if needed */ + send_flush_unsafe(ibqp, _1thrd_evict, wqe_64); + else + *qp->sdb = qp->doorbell_qpn; + + if (unlikely(thread_safe)) + mlx4_unlock(&qp->sq.lock); + + return 0; +} + +static inline int mlx4_send_burst_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) __attribute__((always_inline)); +static inline int mlx4_send_burst_safe( + struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + uint32_t flags, const int lb) +{ + struct mlx4_qp *mqp = to_mqp(ibqp); + int raw_eth = mqp->qp_type == IBV_QPT_RAW_PACKET && mqp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe_64 = mqp->sq.wqe_shift == 6; + int _1thrd_evict = mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB || + mqp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + int use_bf = mqp->db_method != MLX4_QP_DB_METHOD_DB; + + return send_msg_list(ibqp, sg_list, num, flags, raw_eth, 1, wqe_64, use_bf, _1thrd_evict, lb); +} + +static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_burst_safe_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 1); +} + +static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; +static int mlx4_send_burst_safe_no_lb(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, uint32_t flags) +{ + return mlx4_send_burst_safe(ibqp, sg_list, num, flags, 0); +} + 
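Every variant generated by the MLX4_SEND_PENDING_* and MLX4_SEND_BURST_* macros is a thin wrapper around one always_inline implementation whose behaviour switches are const ints, so the compiler folds the dead branches away and each wrapper becomes a specialized fast path. The self-contained toy below shows the same technique; the names are invented and the __MLX4_ALGN_FUNC__ alignment attribute is not reproduced.

#include <stdio.h>

/*
 * One generic implementation: 'use_csum' and 'wide' are compile-time
 * constants at every call site, so the unreachable branches disappear
 * after inlining.
 */
static inline int xmit(int payload, const int use_csum, const int wide)
    __attribute__((always_inline));
static inline int xmit(int payload, const int use_csum, const int wide)
{
    int words = wide ? 4 : 1;

    if (use_csum)
        payload ^= 0x5a;        /* stand-in for checksum work */
    return payload * words;
}

/* Macro-generated specializations, one symbol per variant, mirroring
 * how the driver builds its unsafe send_pending/send_burst tables. */
#define DEFINE_XMIT(csum, wide)                              \
    static int xmit_##csum##wide(int payload)                \
    {                                                        \
        return xmit(payload, csum, wide);                    \
    }
DEFINE_XMIT(0, 0)
DEFINE_XMIT(0, 1)
DEFINE_XMIT(1, 0)
DEFINE_XMIT(1, 1)

int main(void)
{
    printf("%d %d %d %d\n",
           xmit_00(7), xmit_01(7), xmit_10(7), xmit_11(7));
    return 0;
}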
+#define MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb) mlx4_send_burst_unsafe_##_1thrd_evict##eth##wqe64##lb +#define MLX4_SEND_BURST_UNSAFE(_1thrd_evict, eth, wqe64, lb) \ + static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 1, _1thrd_evict, \ + lb); \ + } +/* _1thrd_evict, eth, wqe64, lb */ +MLX4_SEND_BURST_UNSAFE(0, 0, 0, 0); +MLX4_SEND_BURST_UNSAFE(0, 0, 0, 1); +MLX4_SEND_BURST_UNSAFE(0, 0, 1, 0); +MLX4_SEND_BURST_UNSAFE(0, 0, 1, 1); +MLX4_SEND_BURST_UNSAFE(0, 1, 0, 0); +MLX4_SEND_BURST_UNSAFE(0, 1, 0, 1); +MLX4_SEND_BURST_UNSAFE(0, 1, 1, 0); +MLX4_SEND_BURST_UNSAFE(0, 1, 1, 1); +MLX4_SEND_BURST_UNSAFE(1, 0, 0, 0); +MLX4_SEND_BURST_UNSAFE(1, 0, 0, 1); +MLX4_SEND_BURST_UNSAFE(1, 0, 1, 0); +MLX4_SEND_BURST_UNSAFE(1, 0, 1, 1); +MLX4_SEND_BURST_UNSAFE(1, 1, 0, 0); +MLX4_SEND_BURST_UNSAFE(1, 1, 0, 1); +MLX4_SEND_BURST_UNSAFE(1, 1, 1, 0); +MLX4_SEND_BURST_UNSAFE(1, 1, 1, 1); + +#define MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb) mlx4_send_burst_unsafe_##eth##wqe64##lb +#define MLX4_SEND_BURST_UNSAFE_DB(eth, wqe64, lb) \ + static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num, uint32_t flags) \ + { \ + return send_msg_list(ibqp, sg_list, num, flags, eth, 0, wqe64, 0, 0, lb); \ + } +/* eth, wqe64, lb */ +MLX4_SEND_BURST_UNSAFE_DB(0, 0, 0); +MLX4_SEND_BURST_UNSAFE_DB(0, 0, 1); +MLX4_SEND_BURST_UNSAFE_DB(0, 1, 0); +MLX4_SEND_BURST_UNSAFE_DB(0, 1, 1); +MLX4_SEND_BURST_UNSAFE_DB(1, 0, 0); +MLX4_SEND_BURST_UNSAFE_DB(1, 0, 1); +MLX4_SEND_BURST_UNSAFE_DB(1, 1, 0); +MLX4_SEND_BURST_UNSAFE_DB(1, 1, 1); + +/* burst family - send_flush */ +static int mlx4_send_flush_db(struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; +static int mlx4_send_flush_db(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + *qp->sdb = qp->doorbell_qpn; + + return 0; +} + +static inline int send_flush_unsafe(struct ibv_qp *ibqp, const int _1thrd_evict, const int wqe64) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + if (qp->last_db_head + 1 == qp->sq.head) { + struct mlx4_wqe_ctrl_seg *ctrl = get_send_wqe(qp, qp->last_db_head & (qp->sq.wqe_cnt - 1)); + int size = ctrl->fence_size & 0x3f; + + /* + * There is no need to check that size > 1 since we get here only + * after using send_pending function, this guarantee that size > 1 + */ + if (wqe64) + copy_wqe_to_bf(qp, ctrl, 64, qp->last_db_head, + 1, _1thrd_evict); + else if (size <= qp->bf_buf_size / 16) + copy_wqe_to_bf(qp, ctrl, align(size * 16, 64), + qp->last_db_head, + 1, _1thrd_evict); + else + *qp->sdb = qp->doorbell_qpn; + } else { + *qp->sdb = qp->doorbell_qpn; + } + qp->last_db_head = qp->sq.head; + + return 0; +} + +#define MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64) mlx4_send_flush_unsafe_##_1thrd_evict##wqe64 +#define MLX4_SEND_FLUSH_UNSAFE(_1thrd_evict, wqe64) \ + static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \ + struct ibv_qp *ibqp) __MLX4_ALGN_FUNC__; \ + static int MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64)( \ + struct ibv_qp *ibqp) \ + { \ + 
return send_flush_unsafe(ibqp, _1thrd_evict, wqe64); \ + } + +/* _1thrd_evict, wqe64 */ +MLX4_SEND_FLUSH_UNSAFE(0, 0); +MLX4_SEND_FLUSH_UNSAFE(1, 0); +MLX4_SEND_FLUSH_UNSAFE(0, 1); +MLX4_SEND_FLUSH_UNSAFE(1, 1); + +/* burst family - recv_burst */ +static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + const int thread_safe, const int use_inlne_recv, const int max_one_sge) __attribute__((always_inline)); +static inline int recv_burst(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num, + const int thread_safe, const int use_inlne_recv, const int max_one_sge) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + struct mlx4_inlr_rbuff *rbuffs; + unsigned int ind; + int i; + + if (thread_safe) + mlx4_lock(&qp->rq.lock); + + for (i = 0; i < num; ++i) { + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + scat = get_recv_wqe(qp, ind); + __set_data_seg(scat, sg_list); + + if (!max_one_sge) { + scat[1].byte_count = 0; + scat[1].lkey = htonl(MLX4_INVALID_LKEY); + scat[1].addr = 0; + } + + if (use_inlne_recv) { + rbuffs = qp->inlr_buff.buff[ind].sg_list; + qp->inlr_buff.buff[ind].list_len = 1; + rbuffs->rbuff = (void *)(unsigned long)(sg_list->addr); + rbuffs->rlen = sg_list->length; + rbuffs++; + } + sg_list++; + qp->rq.head++; + } + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db = htonl(qp->rq.head & 0xffff); + + if (thread_safe) + mlx4_unlock(&qp->rq.lock); + + return 0; +} + +static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) __MLX4_ALGN_FUNC__; +static int mlx4_recv_burst_safe(struct ibv_qp *ibqp, struct ibv_sge *sg_list, uint32_t num) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + + return recv_burst(ibqp, sg_list, num, 1, qp->max_inlr_sg, qp->rq.max_gs == 1); +} +#define MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge) mlx4_recv_burst_unsafe_##inlr##_1sge +#define MLX4_RECV_BURST_UNSAFE(inlr, _1sge) \ + static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num) __MLX4_ALGN_FUNC__; \ + static int MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge)( \ + struct ibv_qp *ibqp, struct ibv_sge *sg_list, \ + uint32_t num) \ + { \ + return recv_burst(ibqp, sg_list, num, 0, inlr, _1sge); \ + } +/* inlr, _1sge */ +MLX4_RECV_BURST_UNSAFE(0, 0); +MLX4_RECV_BURST_UNSAFE(1, 0); +MLX4_RECV_BURST_UNSAFE(0, 1); +MLX4_RECV_BURST_UNSAFE(1, 1); + +/* + * qp_burst family implementation for safe QP + */ +struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_lb = { + .send_burst = mlx4_send_burst_safe_lb, + .send_pending = mlx4_send_pending_safe_lb, + .send_pending_inline = mlx4_send_pending_inl_safe_lb, + .send_pending_sg_list = mlx4_send_pending_sg_list_safe_lb, + .recv_burst = mlx4_recv_burst_safe, + .send_flush = mlx4_send_flush_db +}; + +struct ibv_exp_qp_burst_family mlx4_qp_burst_family_safe_no_lb = { + .send_burst = mlx4_send_burst_safe_no_lb, + .send_pending = mlx4_send_pending_safe_no_lb, + .send_pending_inline = mlx4_send_pending_inl_safe_no_lb, + .send_pending_sg_list = mlx4_send_pending_sg_list_safe_no_lb, + .recv_burst = mlx4_recv_burst_safe, + .send_flush = mlx4_send_flush_db +}; + +/* + * qp_burst family implementation table for unsafe QP + */ +#define MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \ + (lb << 5 | _1thrd_evict << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge) + +#define MLX4_QP_BURST_UNSAFE_TBL_ENTRY(lb, _1thrd_evict, eth, wqe64, inlr, _1sge) \ + 
[MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)] = { \ + .send_burst = MLX4_SEND_BURST_UNSAFE_NAME(_1thrd_evict, eth, wqe64, lb), \ + .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \ + .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \ + .send_flush = MLX4_SEND_FLUSH_UNSAFE_NAME(_1thrd_evict, wqe64), \ + } +static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_tbl[1 << 6] = { + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(0, 1, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 
0, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_TBL_ENTRY(1, 1, 1, 1, 1, 1), +}; + +#define MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge) \ + (lb << 4 | eth << 3 | wqe64 << 2 | inlr << 1 | _1sge) + +#define MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(lb, eth, wqe64, inlr, _1sge) \ + [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)] = { \ + .send_burst = MLX4_SEND_BURST_UNSAFE_DB_NAME(eth, wqe64, lb), \ + .send_pending = MLX4_SEND_PENDING_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_inline = MLX4_SEND_PENDING_INL_UNSAFE_NAME(eth, wqe64, lb), \ + .send_pending_sg_list = MLX4_SEND_PENDING_SG_LIST_UNSAFE_NAME(eth, wqe64, lb), \ + .recv_burst = MLX4_RECV_BURST_UNSAFE_NAME(inlr, _1sge), \ + .send_flush = mlx4_send_flush_db, \ + } +static struct ibv_exp_qp_burst_family mlx4_qp_burst_family_unsafe_db_tbl[1 << 5] = { + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(0, 1, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 0, 1, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 0, 1, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 0, 1), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 0), + MLX4_QP_BURST_UNSAFE_DB_TBL_ENTRY(1, 1, 1, 1, 1), +}; + +struct ibv_exp_qp_burst_family *mlx4_get_qp_burst_family(struct mlx4_qp *qp, + struct ibv_exp_query_intf_params *params, + enum ibv_exp_query_intf_status *status) +{ + enum ibv_exp_query_intf_status ret = IBV_EXP_INTF_STAT_OK; + struct ibv_exp_qp_burst_family *family = NULL; + uint32_t unsupported_f; + + if ((qp->verbs_qp.qp.state < IBV_QPS_INIT) || (qp->verbs_qp.qp.state > IBV_QPS_RTS)) { + *status = IBV_EXP_INTF_STAT_INVAL_OBJ_STATE; + return NULL; + } + + if (params->flags) { + fprintf(stderr, PFX "Global interface flags(0x%x) are not supported for QP family\n", params->flags); + *status = IBV_EXP_INTF_STAT_FLAGS_NOT_SUPPORTED; + + return NULL; + } + unsupported_f = params->family_flags & 
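Both family tables above are indexed by packing independent boolean properties of the QP (loopback, doorbell method, link layer, 64-byte WQEs, inline receive, single-SGE RQ) into a small integer, so picking an implementation at query time is a single array load. A generic version of the pattern, with invented feature names and only three bits, is sketched here.

#include <stdio.h>

/* Three invented feature bits for the sketch. */
#define IDX(eth, wqe64, inlr) ((eth) << 2 | (wqe64) << 1 | (inlr))

struct ops {
    const char *name;
    int (*send)(int len);
};

static int send_generic(int len)   { return len; }
static int send_eth(int len)       { return len + 14; }   /* pretend: add an L2 header */
static int send_eth_wqe64(int len) { return len + 14; }

#define ENTRY(eth, wqe64, inlr, fn) \
    [IDX(eth, wqe64, inlr)] = { .name = #fn, .send = fn }

/* Every combination gets a slot (the driver enumerates them all). */
static const struct ops ops_tbl[1 << 3] = {
    ENTRY(0, 0, 0, send_generic),
    ENTRY(0, 0, 1, send_generic),
    ENTRY(0, 1, 0, send_generic),
    ENTRY(0, 1, 1, send_generic),
    ENTRY(1, 0, 0, send_eth),
    ENTRY(1, 0, 1, send_eth),
    ENTRY(1, 1, 0, send_eth_wqe64),
    ENTRY(1, 1, 1, send_eth_wqe64),
};

int main(void)
{
    int eth = 1, wqe64 = 0, inlr = 1;           /* properties known at setup time */
    const struct ops *ops = &ops_tbl[IDX(eth, wqe64, inlr)];

    printf("%s -> %d\n", ops->name, ops->send(100));
    return 0;
}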
~(IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK | + IBV_EXP_QP_BURST_CREATE_ENABLE_MULTI_PACKET_SEND_WR); + if (unsupported_f) { + fprintf(stderr, PFX "Family flags(0x%x) are not supported for QP family\n", unsupported_f); + *status = IBV_EXP_INTF_STAT_FAMILY_FLAGS_NOT_SUPPORTED; + + return NULL; + } + + switch (qp->qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case IBV_QPT_RAW_PACKET: + if (qp->model_flags & MLX4_QP_MODEL_FLAG_THREAD_SAFE) { + int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK); + + if (lb) + family = &mlx4_qp_burst_family_safe_lb; + else + family = &mlx4_qp_burst_family_safe_no_lb; + } else { + int eth = qp->qp_type == IBV_QPT_RAW_PACKET && + qp->link_layer == IBV_LINK_LAYER_ETHERNET; + int wqe64 = qp->sq.wqe_shift == 6; + int inlr = qp->max_inlr_sg != 0; + int _1sge = qp->rq.max_gs == 1; + int _1thrd_evict = qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB || + qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + int lb = !(params->family_flags & IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK); + + if (qp->db_method == MLX4_QP_DB_METHOD_DB) + family = &mlx4_qp_burst_family_unsafe_db_tbl + [MLX4_QP_BURST_UNSAFE_DB_TBL_IDX(lb, eth, wqe64, inlr, _1sge)]; + else + family = &mlx4_qp_burst_family_unsafe_tbl + [MLX4_QP_BURST_UNSAFE_TBL_IDX(lb, _1thrd_evict, eth, wqe64, inlr, _1sge)]; + } + break; + + default: + ret = IBV_EXP_INTF_STAT_INVAL_PARARM; + break; + } + + *status = ret; + + return family; +} Index: contrib/ofed/libmlx4/src/srq.c =================================================================== --- contrib/ofed/libmlx4/src/srq.c +++ contrib/ofed/libmlx4/src/srq.c @@ -42,6 +42,7 @@ #include "mlx4.h" #include "doorbell.h" #include "wqe.h" +#include "mlx4-abi.h" static void *get_wqe(struct mlx4_srq *srq, int n) { @@ -52,38 +53,43 @@ { struct mlx4_wqe_srq_next_seg *next; - pthread_spin_lock(&srq->lock); + mlx4_spin_lock(&srq->lock); next = get_wqe(srq, srq->tail); next->next_wqe_index = htons(ind); srq->tail = ind; - pthread_spin_unlock(&srq->lock); + mlx4_spin_unlock(&srq->lock); } int mlx4_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { - struct mlx4_srq *srq = to_msrq(ibsrq); + struct mlx4_srq *srq; struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scat; int err = 0; int nreq; int i; - pthread_spin_lock(&srq->lock); + if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE) + ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq); + srq = to_msrq(ibsrq); + mlx4_spin_lock(&srq->lock); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (wr->num_sge > srq->max_gs) { - err = -1; + errno = EINVAL; + err = errno; *bad_wr = wr; break; } if (srq->head == srq->tail) { /* SRQ is full*/ - err = -1; + errno = ENOMEM; + err = errno; *bad_wr = wr; break; } @@ -119,7 +125,7 @@ *srq->db = htonl(srq->counter); } - pthread_spin_unlock(&srq->lock); + mlx4_spin_unlock(&srq->lock); return err; } @@ -174,52 +180,153 @@ return 0; } -struct mlx4_srq *mlx4_find_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn) +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; + memset(xsrq_table, 0, sizeof *xsrq_table); + xsrq_table->num_xsrq = size; + xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; + xsrq_table->mask = (1 << xsrq_table->shift) - 1; - if (ctx->xrc_srq_table[tind].refcnt) - return ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask]; - else - return 
NULL; + pthread_mutex_init(&xsrq_table->mutex, NULL); } -int mlx4_store_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn, - struct mlx4_srq *srq) +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; - int ret = 0; + int index; - pthread_mutex_lock(&ctx->xrc_srq_table_mutex); + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + if (xsrq_table->xsrq_table[index].refcnt) + return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; + + return NULL; +} - if (!ctx->xrc_srq_table[tind].refcnt) { - ctx->xrc_srq_table[tind].table = calloc(ctx->xrc_srq_table_mask + 1, - sizeof(struct mlx4_srq *)); - if (!ctx->xrc_srq_table[tind].table) { +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq) +{ + int index, ret = 0; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + if (!xsrq_table->xsrq_table[index].refcnt) { + xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, + sizeof(struct mlx4_srq *)); + if (!xsrq_table->xsrq_table[index].table) { ret = -1; goto out; } } - ++ctx->xrc_srq_table[tind].refcnt; - ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = srq; + xsrq_table->xsrq_table[index].refcnt++; + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; out: - pthread_mutex_unlock(&ctx->xrc_srq_table_mutex); + pthread_mutex_unlock(&xsrq_table->mutex); return ret; } -void mlx4_clear_xrc_srq(struct mlx4_context *ctx, uint32_t xrc_srqn) +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) { - int tind = (xrc_srqn & (ctx->num_xrc_srqs - 1)) >> ctx->xrc_srq_table_shift; + int index; - pthread_mutex_lock(&ctx->xrc_srq_table_mutex); + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); - if (!--ctx->xrc_srq_table[tind].refcnt) - free(ctx->xrc_srq_table[tind].table); + if (--xsrq_table->xsrq_table[index].refcnt) + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; else - ctx->xrc_srq_table[tind].table[xrc_srqn & ctx->xrc_srq_table_mask] = NULL; + free(xsrq_table->xsrq_table[index].table); + + pthread_mutex_unlock(&xsrq_table->mutex); +} + +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + struct mlx4_create_xsrq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; - pthread_mutex_unlock(&ctx->xrc_srq_table_mutex); + /* Sanity check SRQ size before proceeding */ + if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) + return NULL; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded)) + goto err; + + srq->max = align_queue_size(attr_ex->attr.max_wr + 1); + srq->max_gs = attr_ex->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 1; + + if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, sizeof(srq->verbs_srq), + attr_ex, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, + srq->verbs_srq.srq_num, srq); + if 
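mlx4_find_xsrq()/mlx4_store_xsrq()/mlx4_clear_xsrq() above keep SRQs in a two-level table: high bits of the SRQ number pick a top-level slot, that slot's second-level array is allocated lazily and reference counted, and the low bits pick the entry. A reduced, single-threaded model of the structure with arbitrary sizes follows (the real code also takes xsrq_table->mutex around updates).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOP_BITS   4                       /* arbitrary sizes for the sketch */
#define LOW_BITS   6
#define LOW_MASK   ((1u << LOW_BITS) - 1)

struct slot {
    void **table;      /* lazily allocated, (1 << LOW_BITS) entries */
    int    refcnt;     /* live objects hashed into this slot */
};

static struct slot slots[1 << TOP_BITS];

static unsigned top_index(uint32_t n)
{
    return (n >> LOW_BITS) & ((1u << TOP_BITS) - 1);
}

static int store(uint32_t n, void *obj)
{
    struct slot *s = &slots[top_index(n)];

    if (!s->refcnt) {
        s->table = calloc(1u << LOW_BITS, sizeof(void *));
        if (!s->table)
            return -1;
    }
    s->refcnt++;
    s->table[n & LOW_MASK] = obj;
    return 0;
}

static void *find(uint32_t n)
{
    struct slot *s = &slots[top_index(n)];

    return s->refcnt ? s->table[n & LOW_MASK] : NULL;
}

static void clear(uint32_t n)
{
    struct slot *s = &slots[top_index(n)];

    if (--s->refcnt)
        s->table[n & LOW_MASK] = NULL;     /* others still live: just drop ours */
    else
        free(s->table);                    /* last one out frees the level */
}

int main(void)
{
    int obj = 42;

    store(0x123, &obj);
    printf("found %p\n", find(0x123));
    clear(0x123);
    printf("after clear: %p\n", find(0x123));
    return 0;
}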
(ret) + goto err_destroy; + + return &srq->verbs_srq.srq; + +err_destroy: + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); +err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); +err: + free(srq); + return NULL; } +int mlx4_destroy_xrc_srq(struct ibv_srq *srq) +{ + struct mlx4_context *mctx = to_mctx(srq->context); + struct mlx4_srq *msrq = to_msrq(srq); + struct mlx4_cq *mcq; + int ret; + + mcq = to_mcq(msrq->verbs_srq.cq); + mlx4_cq_clean(mcq, 0, msrq); + mlx4_lock(&mcq->lock); + mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num); + mlx4_unlock(&mcq->lock); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) { + mlx4_lock(&mcq->lock); + mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq); + mlx4_unlock(&mcq->lock); + return ret; + } + + mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); + mlx4_free_buf(&msrq->buf); + free(msrq->wrid); + free(msrq); + + return 0; +} Index: contrib/ofed/libmlx4/src/verbs.c =================================================================== --- contrib/ofed/libmlx4/src/verbs.c +++ contrib/ofed/libmlx4/src/verbs.c @@ -40,38 +40,130 @@ #include #include #include - +#include +#include +#include +#include +/* Added for reg_mr mmap munmap system calls */ +#include +#include +#include +#include #include "mlx4.h" #include "mlx4-abi.h" +#include "mlx4_exp.h" #include "wqe.h" +#define SHARED_MR_PROC_DIR_NAME "/proc/driver/mlx4_ib/mrs" +#define FPATH_MAX 128 + +int __mlx4_query_device(uint64_t raw_fw_ver, + struct ibv_device_attr *attr) +{ + unsigned major, minor, sub_minor; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) { struct ibv_query_device cmd; uint64_t raw_fw_ver; - unsigned major, minor, sub_minor; int ret; - ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + read_init_vars(to_mctx(context)); + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, + sizeof(cmd)); if (ret) return ret; - major = (raw_fw_ver >> 32) & 0xffff; - minor = (raw_fw_ver >> 16) & 0xffff; - sub_minor = raw_fw_ver & 0xffff; + return __mlx4_query_device(raw_fw_ver, attr); +} - snprintf(attr->fw_ver, sizeof attr->fw_ver, - "%d.%d.%03d", major, minor, sub_minor); +#define READL(ptr) (*((uint32_t *)(ptr))) + +static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles) +{ + unsigned int clockhi, clocklo, clockhi1; + int i; + struct mlx4_context *ctx = to_mctx(context); + + if (ctx->hca_core_clock == NULL) + return -EOPNOTSUPP; + + for (i = 0; i < 10; i++) { + clockhi = ntohl(READL(ctx->hca_core_clock)); + clocklo = ntohl(READL(ctx->hca_core_clock + 4)); + clockhi1 = ntohl(READL(ctx->hca_core_clock)); + if (clockhi == clockhi1) + break; + } + + if (clocklo == 0) + clockhi++; + + *cycles = (uint64_t) clockhi << 32 | (uint64_t) clocklo; return 0; } +int mlx4_query_values(struct ibv_context *context, int q_values, + struct ibv_exp_values *values) +{ + struct mlx4_context *ctx = to_mctx(context); + uint64_t cycles; + int err; + uint32_t comp_mask = values->comp_mask; + + values->comp_mask = 0; + + if (q_values & (IBV_EXP_VALUES_HW_CLOCK | IBV_EXP_VALUES_HW_CLOCK_NS)) { + err = mlx4_read_clock(context, &cycles); + if (!err) { + if (comp_mask & IBV_EXP_VALUES_HW_CLOCK) { + values->hwclock = cycles; + values->comp_mask |= 
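mlx4_read_clock() above assembles a 64-bit cycle counter from two 32-bit reads of device memory, re-reading the high word to detect a low-word wrap between the two halves and retrying a bounded number of times. The sketch below runs the same high/low/high pattern against a simulated register pair; the simulation stands in for uncached MMIO reads, and the byte swapping and the low-word-zero adjustment in the driver are omitted.

#include <stdint.h>
#include <stdio.h>

/* Simulated 64-bit device counter exposed as two 32-bit words, the way
 * the HCA core clock is.  In the driver these are MMIO reads. */
static uint64_t fake_counter = 0x00000001fffffff0ull;

static uint32_t read_hi(void) { return (uint32_t)(fake_counter >> 32); }
static uint32_t read_lo(void)
{
    fake_counter += 0x10;               /* the counter keeps ticking between reads */
    return (uint32_t)fake_counter;
}

/*
 * Read high, then low, then high again: if the two high reads agree,
 * the low word did not wrap in between and the pair is consistent.
 */
static uint64_t read_clock(void)
{
    uint32_t hi = 0, lo = 0, hi2 = 0;
    int i;

    for (i = 0; i < 10; i++) {
        hi  = read_hi();
        lo  = read_lo();
        hi2 = read_hi();
        if (hi == hi2)
            break;
    }
    return (uint64_t)hi << 32 | lo;
}

int main(void)
{
    printf("cycles = 0x%llx\n", (unsigned long long)read_clock());
    return 0;
}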
IBV_EXP_VALUES_HW_CLOCK; + } + if (q_values & IBV_EXP_VALUES_HW_CLOCK_NS) { + if (comp_mask & IBV_EXP_VALUES_HW_CLOCK_NS) { + values->hwclock_ns = + ((uint64_t)values->hwclock * + ctx->core_clk.mult) + >> ctx->core_clk.shift; + values->comp_mask |= IBV_EXP_VALUES_HW_CLOCK_NS; + } + } + } + } + return 0; +} int mlx4_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; + int err; + + read_init_vars(to_mctx(context)); + err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); + if (!err && port <= MLX4_PORTS_NUM && port > 0) { + struct mlx4_context *mctx = to_mctx(context); + if (!mctx->port_query_cache[port - 1].valid) { + mctx->port_query_cache[port - 1].link_layer = + attr->link_layer; + mctx->port_query_cache[port - 1].caps = + attr->port_cap_flags; + mctx->port_query_cache[port - 1].valid = 1; + } + } - return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); + return err; } struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) @@ -80,6 +172,7 @@ struct mlx4_alloc_pd_resp resp; struct mlx4_pd *pd; + read_init_vars(to_mctx(context)); pd = malloc(sizeof *pd); if (!pd) return NULL; @@ -107,50 +200,570 @@ return 0; } -struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, - enum ibv_access_flags access) + +static void mlx4_free_mr(struct mlx4_mr *mlx4_mr) +{ + /* mr address was allocated in speical mode - freed accordingly */ + if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR || + mlx4_mr->shared_mr) + mlx4_free_buf(&(mlx4_mr->buf)); + + /* Finally we free the structure itself */ + free(mlx4_mr); +} + + +static void *mlx4_get_contiguous_alloc_fallback(struct mlx4_buf *buf, + struct ibv_pd *pd, size_t length) +{ + + /* We allocate as fallback mode non contiguous pages*/ + if (mlx4_alloc_buf( + buf, + align(length, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) + return NULL; + + return buf->buf; +} + + +/* We'll call mmap on mlx4_ib module to achieve this task */ +static void *mlx4_get_contiguous_alloc(struct mlx4_buf *mlx4_buf, + struct ibv_pd *pd, + size_t length, + void *contig_addr) +{ + size_t alloc_length; + int page_size; + int mr_no_allocator = 0; + int mr_force_contig_pages = 0; + enum mlx4_alloc_type alloc_type; + + mlx4_get_alloc_type(pd->context, MLX4_MR_PREFIX, &alloc_type, + MLX4_ALLOC_TYPE_ALL); + + if (alloc_type == MLX4_ALLOC_TYPE_CONTIG) + mr_force_contig_pages = 1; + else if (alloc_type == MLX4_ALLOC_TYPE_ANON) + mr_no_allocator = 1; + + /* For benchmarking purposes we apply an option to turn off continuous + allocator based on environment variable + */ + if (mr_no_allocator) + return mlx4_get_contiguous_alloc_fallback(mlx4_buf, pd, + length); + + page_size = to_mdev(pd->context->device)->page_size; + alloc_length = (contig_addr ? length : align(length, page_size)); + if (!(mlx4_alloc_buf_contig(to_mctx(pd->context), + mlx4_buf, alloc_length, + page_size, MLX4_MR_PREFIX, contig_addr))) + return contig_addr ? 
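The HW_CLOCK_NS branch above converts cycles to nanoseconds with a precomputed multiply-and-shift, (hwclock * mult) >> shift, instead of a 64-bit division per query; the driver obtains mult and shift at context initialization. The stand-alone sketch shows one common way such a pair can be derived from a clock frequency; the 156.25 MHz figure and the shift choice are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ull

/*
 * Derive a fixed-point pair such that ns ~= (cycles * mult) >> shift.
 * A larger shift gives more precision but shrinks the largest cycle
 * count for which cycles * mult still fits in 64 bits.
 */
static void calc_mult_shift(uint64_t freq_hz, uint32_t shift, uint64_t *mult)
{
    *mult = (NSEC_PER_SEC << shift) / freq_hz;
}

static uint64_t cycles_to_ns(uint64_t cycles, uint64_t mult, uint32_t shift)
{
    return (cycles * mult) >> shift;
}

int main(void)
{
    uint64_t freq = 156250000ull;      /* illustrative core clock: 156.25 MHz */
    uint32_t shift = 21;               /* keeps cycles * mult in 64 bits up to ~2^40 cycles */
    uint64_t mult;

    calc_mult_shift(freq, shift, &mult);

    /* One second worth of cycles should map to roughly 1e9 ns. */
    printf("mult=%llu, 1s -> %llu ns\n",
           (unsigned long long)mult,
           (unsigned long long)cycles_to_ns(freq, mult, shift));
    return 0;
}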
contig_addr : mlx4_buf->buf; + + if (mr_force_contig_pages || contig_addr) + return NULL; + + return mlx4_get_contiguous_alloc_fallback(mlx4_buf, + pd, length); + +} + +static int mlx4_get_shared_mr_name(char *in_pattern, char *file_name) +{ + glob_t results; + int ret; + + ret = glob(in_pattern, 0, NULL, &results); + + if (ret) { + if (mlx4_trace) + /* might be some legacy kernel with old mode */ + fprintf(stderr, "mlx4_get_shared_mr_name: glob failed for %s, ret=%d, errno=%d\n", + in_pattern, ret, errno); + return ret; + } + + if (results.gl_pathc > 1) { + int i; + int duplicate_name = 1; + + /* we encountered an issue where glob retuned same name twice, we suspect it to be + * an issue with glob/procfs. When there is more than one entry check whether all entries + * are the same in that case API succeeded and we use first entry name. + */ + for (i = 1; i < results.gl_pathc; i++) { + if (strcmp(results.gl_pathv[0], results.gl_pathv[i])) { + duplicate_name = 0; + break; + } + } + + if (!duplicate_name) { + fprintf(stderr, "mlx4_get_shared_mr_name failed for %s, unexpected %lu paths were found\n", + in_pattern, (unsigned long)(results.gl_pathc)); + for (i = 0; i < results.gl_pathc; i++) + fprintf(stderr, "mlx4_get_shared_mr_name: path#%d=%s\n", i, + results.gl_pathv[i]); + globfree(&results); + return -EINVAL; + } + } + + strncpy(file_name, results.gl_pathv[0], FPATH_MAX); + file_name[FPATH_MAX - 1] = '\0'; + globfree(&results); + return 0; +} + +struct ibv_mr *mlx4_reg_shared_mr(struct ibv_exp_reg_shared_mr_in *in) +{ + struct ibv_context *context; + size_t total_size; + int page_size; + char shared_mr_file_name[FPATH_MAX]; + char shared_mr_pattern[FPATH_MAX]; + int fd; + struct stat buffer; + int status; + struct ibv_mr *ibv_mr; + uint64_t shared_flags; + struct mlx4_mr *mlx4_mr = NULL; + void *addr = in->addr; + uint64_t access = in->exp_access; + struct ibv_exp_reg_mr_in rmr_in; + int flags; + int ret; + int is_writeable_mr = !!(access & (IBV_EXP_ACCESS_REMOTE_WRITE | + IBV_EXP_ACCESS_LOCAL_WRITE | IBV_EXP_ACCESS_REMOTE_ATOMIC)); + + context = in->pd->context; + page_size = to_mdev(context->device)->page_size; + sprintf(shared_mr_pattern, "%s/%X.*", + SHARED_MR_PROC_DIR_NAME, in->mr_handle); + + ret = mlx4_get_shared_mr_name(shared_mr_pattern, shared_mr_file_name); + if (ret) + /* For compatability issue trying with legacy name */ + sprintf(shared_mr_file_name, "%s/%X", + SHARED_MR_PROC_DIR_NAME, in->mr_handle); + + flags = is_writeable_mr ? O_RDWR : O_RDONLY; + fd = open(shared_mr_file_name, flags); + if (fd < 0) { + int counter = 10; + /* retrying for 1 second before reporting an error */ + while (fd < 0 && counter > 0) { + usleep(100000); + counter--; + fd = open(shared_mr_file_name, flags); + } + + if (fd < 0) { + fprintf(stderr, "mlx4_reg_shared_mr failed open %s errno=%d\n", + shared_mr_file_name, errno); + return NULL; + } + } + + status = fstat(fd, &buffer); + if (status) { + fprintf(stderr, + "mlx4_reg_shared_mr lstat has failed , errno=%d\n", + errno); + goto error; + } + + total_size = align(buffer.st_size, page_size); + + /* set protection based on access flags input address may be NULL + or other recommended address by the application. + */ + addr = mmap(addr , total_size, + is_writeable_mr ? 
(PROT_WRITE | PROT_READ) : + PROT_READ, MAP_SHARED, + fd, + 0); + + /* On a failure MAP_FAILED (that is, (void *) -1) is returned*/ + if (addr == MAP_FAILED) { + fprintf(stderr, + "mlx4_reg_shared_mr mmap has failed , errno=%d\n", + errno); + goto error; + } + + if (ibv_dontfork_range(addr, total_size)) { + fprintf(stderr, + "mlx4_reg_shared_mr dontfork has failed , errno=%d\n", + errno); + goto err_unmap; + } + + if (access & IBV_EXP_ACCESS_NO_RDMA) { + mlx4_mr = calloc(1, sizeof *mlx4_mr); + if (!mlx4_mr) + goto err_dofork; + + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_NO_RDMA; + ibv_mr = &(mlx4_mr->ibv_mr); + ibv_mr->context = in->pd->context; + + } else { + /* Make sure that shared access flags are off before + calling to reg_mr, otherwise new mr will be shared as well. + */ + shared_flags = IBV_EXP_ACCESS_SHARED_MR_USER_READ | + IBV_EXP_ACCESS_SHARED_MR_USER_WRITE | + IBV_EXP_ACCESS_SHARED_MR_GROUP_READ | + IBV_EXP_ACCESS_SHARED_MR_GROUP_WRITE | + IBV_EXP_ACCESS_SHARED_MR_OTHER_READ | + IBV_EXP_ACCESS_SHARED_MR_OTHER_WRITE; + + access &= ~shared_flags; + rmr_in.pd = in->pd; + rmr_in.addr = addr; + rmr_in.length = total_size; + rmr_in.exp_access = access; + rmr_in.comp_mask = 0; + + ibv_mr = mlx4_exp_reg_mr(&rmr_in); + if (!ibv_mr) + goto err_dofork; + } + + /* file should be closed - not required any more */ + close(fd); + + ibv_mr->length = total_size; + ibv_mr->addr = addr; + mlx4_mr = to_mmr(ibv_mr); + /* We mark this MR as shared one to be handled correctly via dereg_mr*/ + mlx4_mr->shared_mr = 1; + /* We hook addr & length also internally for further + use via dreg_mr. + */ + mlx4_mr->buf.buf = addr; + mlx4_mr->buf.length = total_size; + return ibv_mr; + +err_dofork: + ibv_dofork_range(addr, total_size); +err_unmap: + munmap(addr, total_size); +error: + close(fd); + return NULL; +} + +int mlx4_exp_dereg_mr(struct ibv_mr *mr, struct ibv_exp_dereg_out *out) +{ + struct mlx4_mr *mlx4_mr = to_mmr(mr); + + out->need_dofork = (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR || + mlx4_mr->shared_mr) ? 0 : 1; + + return mlx4_dereg_mr(mr); +} + +int mlx4_exp_rereg_mr(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, uint64_t access, + struct ibv_exp_rereg_mr_attr *attr, + struct ibv_exp_rereg_out *out) +{ + struct mlx4_mr *mlx4_mr = to_mmr(mr); + struct mlx4_buf buf; + struct ibv_exp_rereg_mr cmd; + struct ibv_exp_rereg_mr_resp resp; + int internal_alloc = 0; + int ret; + + if (flags & (~IBV_EXP_REREG_MR_FLAGS_SUPPORTED | IBV_EXP_REREG_MR_KEEP_VALID)) + return -EINVAL; + + /* Currently, we don't support any features in comp_mask */ + if (attr->comp_mask) + return -EINVAL; + + /* Here we check whether contigous pages are required and + should be allocated internally. + */ + + memset(&buf, 0, sizeof(buf)); + if ((flags & IBV_EXP_REREG_MR_CHANGE_ACCESS) && + !addr && (access & IBV_EXP_ACCESS_ALLOCATE_MR)) { + struct ibv_pd *curr_pd = flags & IBV_EXP_REREG_MR_CHANGE_PD ? 
pd : mr->pd; + addr = mlx4_get_contiguous_alloc(&buf, curr_pd, length, NULL); + if (!addr) + return -ENOMEM; + + internal_alloc = 1; + } + + ret = ibv_exp_cmd_rereg_mr(mr, flags, addr, length, + (uintptr_t) addr, + access, pd, attr, + &cmd, sizeof(cmd), 0, + &resp, sizeof(resp), 0); + + if (ret) { + if (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION) + mlx4_free_buf(&buf); + return ret; + } else { + if (((mlx4_mr->allocation_flags & IBV_EXP_ACCESS_ALLOCATE_MR) || + mlx4_mr->shared_mr) && + (flags & IBV_EXP_REREG_MR_CHANGE_TRANSLATION)) { + mlx4_mr->shared_mr = 0; + mlx4_free_buf(&(mlx4_mr->buf)); + /* The memory was just freed, mark it as NULL */ + mlx4_mr->ibv_mr.addr = NULL; + mlx4_mr->allocation_flags &= ~IBV_EXP_ACCESS_ALLOCATE_MR; + out->need_dofork = 0; + } + if (internal_alloc) { + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR; + /* Address is returned to libibverbs through pointer to + * pointer mechanism + */ + mlx4_mr->ibv_mr.addr = addr; + mlx4_mr->ibv_mr.length = length; + memcpy(&mlx4_mr->buf, &buf, sizeof(mlx4_mr->buf)); + } + } + + return ret; +} + + +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr) +{ + struct ibv_open_xrcd cmd; + struct ibv_open_xrcd_resp resp; + struct verbs_xrcd *xrcd; + int ret; + + xrcd = calloc(1, sizeof *xrcd); + if (!xrcd) + return NULL; + + ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &xrcd->xrcd; + +err: + free(xrcd); + return NULL; +} + +int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) { - struct ibv_mr *mr; + struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (!ret) + free(xrcd); + + return ret; +} + +struct ibv_mr *mlx4_exp_reg_mr(struct ibv_exp_reg_mr_in *in) +{ + + struct mlx4_mr *mlx4_mr; struct ibv_reg_mr cmd; int ret; + int cmd_access; + int is_contig; + + if ((in->comp_mask > IBV_EXP_REG_MR_RESERVED - 1) || + (in->exp_access > IBV_EXP_ACCESS_RESERVED - 1)) { + errno = EINVAL; + return NULL; + } - mr = malloc(sizeof *mr); - if (!mr) + mlx4_mr = calloc(1, sizeof *mlx4_mr); + if (!mlx4_mr) return NULL; + VALGRIND_MAKE_MEM_DEFINED(&in->create_flags, sizeof(in->create_flags)); + is_contig = ((in->exp_access & IBV_EXP_ACCESS_ALLOCATE_MR) && !in->addr) || + ((in->comp_mask & IBV_EXP_REG_MR_CREATE_FLAGS) && + (in->create_flags & IBV_EXP_REG_MR_CREATE_CONTIG)); + /* Here we check whether contigous pages are required and + should be allocated internally. + */ + if (is_contig) { + in->addr = mlx4_get_contiguous_alloc(&mlx4_mr->buf, in->pd, + in->length, in->addr); + if (!in->addr) { + free(mlx4_mr); + return NULL; + } + + mlx4_mr->allocation_flags |= IBV_EXP_ACCESS_ALLOCATE_MR; + /* Hooking the addr on returned pointer for + further use by application. 
+ */ + mlx4_mr->ibv_mr.addr = in->addr; + } + + cmd_access = (in->exp_access & (IBV_EXP_START_FLAG - 1)) | + (in->exp_access & (IBV_EXP_ACCESS_RESERVED - 1)) >> IBV_EXP_START_FLAG_LOC; #ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS { struct ibv_reg_mr_resp resp; - ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, - access, mr, &cmd, sizeof cmd, - &resp, sizeof resp); + ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length, + (uintptr_t) in->addr, cmd_access, + &(mlx4_mr->ibv_mr), + &cmd, sizeof(cmd), + &resp, sizeof(resp)); } #else - ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr, - &cmd, sizeof cmd); + ret = ibv_cmd_reg_mr(in->pd, in->addr, in->length, + (uintptr_t) in->addr, cmd_access, + &(mlx4_mr->ibv_mr), + &cmd, sizeof(cmd)); #endif if (ret) { - free(mr); + mlx4_free_mr(mlx4_mr); return NULL; } - return mr; + return &(mlx4_mr->ibv_mr); +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_exp_reg_mr_in in; + + in.pd = pd; + in.addr = addr; + in.length = length; + in.exp_access = access; + in.comp_mask = 0; + + return mlx4_exp_reg_mr(&in); } int mlx4_dereg_mr(struct ibv_mr *mr) { int ret; + struct mlx4_mr *mlx4_mr = to_mmr(mr); + + if (mlx4_mr->allocation_flags & IBV_EXP_ACCESS_NO_RDMA) + goto free_mr; ret = ibv_cmd_dereg_mr(mr); if (ret) return ret; +free_mr: + mlx4_free_mr(mlx4_mr); + return 0; +} + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct verbs_mw *vmw; + struct ibv_alloc_mw cmd; + struct ibv_alloc_mw_resp resp; + int ret; + + vmw = malloc(sizeof(*vmw)); + if (!vmw) + return NULL; + memset(vmw, 0, sizeof(*vmw)); + + ret = ibv_cmd_alloc_mw(pd, type, vmw, &cmd, sizeof(cmd), + &resp, sizeof(resp)); + + if (ret) { + free(vmw); + return NULL; + } + vmw->type = type; + + return &vmw->mw; +} + +int mlx4_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + struct ibv_dealloc_mw cmd; + struct verbs_mw *vmw = (struct verbs_mw *)mw; + + ret = ibv_cmd_dealloc_mw(vmw, &cmd, sizeof(cmd)); + if (ret) + return ret; + + free(vmw); + return 0; +} + +int __mlx4_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + int ret; + struct ibv_exp_send_wr *bad_wr = NULL; + struct ibv_exp_send_wr wr = { }; + + wr.exp_opcode = IBV_EXP_WR_BIND_MW; + wr.next = NULL; + + wr.wr_id = mw_bind->wr_id; + wr.exp_send_flags = mw_bind->exp_send_flags; + + wr.bind_mw.mw = mw_bind->mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw_bind->mw->rkey); + wr.bind_mw.bind_info = mw_bind->bind_info; + + ret = mlx4_exp_post_send(mw_bind->qp, &wr, &bad_wr); + + if (ret) + return ret; + + /* updating the mw with the latest rkey. 
*/ + mw_bind->mw->rkey = wr.bind_mw.rkey; - free(mr); return 0; } -static int align_queue_size(int req) +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_exp_mw_bind exp_mw_bind; + + memset(&exp_mw_bind, 0, sizeof(exp_mw_bind)); + exp_mw_bind.qp = qp; + exp_mw_bind.exp_send_flags = mw_bind->send_flags; + exp_mw_bind.wr_id = mw_bind->wr_id; + exp_mw_bind.bind_info.addr = (uint64_t)(uintptr_t)mw_bind->addr; + exp_mw_bind.bind_info.length = mw_bind->length; + exp_mw_bind.bind_info.mr = mw_bind->mr; + exp_mw_bind.bind_info.exp_mw_access_flags = mw_bind->mw_access_flags; + exp_mw_bind.comp_mask = 0; + + return __mlx4_bind_mw(&exp_mw_bind); + +} + +int mlx4_exp_bind_mw(struct ibv_exp_mw_bind *mw_bind) +{ + if (mw_bind->comp_mask > IBV_EXP_BIND_MW_RESERVED - 1) + return EINVAL; + return __mlx4_bind_mw(mw_bind); +} + +int align_queue_size(int req) { int nent; @@ -160,36 +773,52 @@ return nent; } -struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector) +static struct ibv_cq *create_cq(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr) { - struct mlx4_create_cq cmd; - struct mlx4_create_cq_resp resp; - struct mlx4_cq *cq; - int ret; - struct mlx4_context *mctx = to_mctx(context); + struct mlx4_create_cq cmd; + struct mlx4_exp_create_cq cmd_e; + struct mlx4_create_cq_resp resp; + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + int thread_safe; /* Sanity check CQ size before proceeding */ if (cqe > 0x3fffff) return NULL; - cq = malloc(sizeof *cq); + cq = calloc(1, sizeof(*cq)); if (!cq) return NULL; cq->cons_index = 0; + cq->wait_index = 0; + cq->wait_count = 0; - if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + thread_safe = !mlx4_single_threaded; + if (attr && (attr->comp_mask & IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN)) { + if (!attr->res_domain) { + errno = EINVAL; + goto err; + } + thread_safe = (to_mres_domain(attr->res_domain)->attr.thread_model == IBV_EXP_THREAD_SAFE); + } + + if (mlx4_lock_init(&cq->lock, thread_safe, mlx4_get_locktype())) goto err; + cq->model_flags = thread_safe ? 
MLX4_CQ_MODEL_FLAG_THREAD_SAFE : 0; + cqe = align_queue_size(cqe + 1); - if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size)) + if (mlx4_alloc_cq_buf(to_mctx(context), &cq->buf, cqe, mctx->cqe_size)) goto err; cq->cqe_size = mctx->cqe_size; - cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); if (!cq->set_ci_db) goto err_buf; @@ -199,16 +828,41 @@ cq->arm_sn = 1; *cq->set_ci_db = 0; - cmd.buf_addr = (uintptr_t) cq->buf.buf; - cmd.db_addr = (uintptr_t) cq->set_ci_db; - - ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, - &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp); + if (NULL != attr) { + cmd_e.buf_addr = (uintptr_t) cq->buf.buf; + cmd_e.db_addr = (uintptr_t) cq->set_ci_db; + } else { + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + } + if (NULL != attr) { + ret = ibv_exp_cmd_create_cq(context, cqe - 1, channel, + comp_vector, &cq->ibv_cq, + &cmd_e.ibv_cmd, + sizeof(cmd_e.ibv_cmd), + sizeof(cmd_e) - sizeof(cmd_e.ibv_cmd), + &resp.ibv_resp, + sizeof(resp.ibv_resp), + sizeof(resp) - sizeof(resp.ibv_resp), + attr); + } else { + ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof(resp)); + } if (ret) goto err_db; cq->cqn = resp.cqn; + cq->stall_next_poll = 0; + cq->stall_enable = mctx->stall_enable; + if (NULL != attr && attr->comp_mask) { + if (cmd_e.ibv_cmd.comp_mask & IBV_EXP_CREATE_CQ_CAP_FLAGS) { + cq->creation_flags = attr->flags; + } + } + + cq->pattern = MLX4_CQ_PATTERN; return &cq->ibv_cq; @@ -216,14 +870,41 @@ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); err_buf: - mlx4_free_buf(&cq->buf); - + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(context), &cq->buf); + else + mlx4_free_buf(&cq->buf); err: free(cq); return NULL; } +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + read_init_vars(to_mctx(context)); + return create_cq(context, cqe, channel, comp_vector, NULL); +} + +struct ibv_cq *mlx4_create_cq_ex(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, + struct ibv_exp_cq_init_attr *attr) +{ + return create_cq(context, cqe, channel, comp_vector, attr); +} + +int mlx4_modify_cq(struct ibv_cq *cq, + struct ibv_exp_cq_attr *attr, + int attr_mask) +{ + struct ibv_exp_modify_cq cmd; + return ibv_exp_cmd_modify_cq(cq, attr, attr_mask, &cmd, sizeof(cmd)); +} + int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) { struct mlx4_cq *cq = to_mcq(ibcq); @@ -235,7 +916,7 @@ if (cqe > 0x3fffff) return EINVAL; - pthread_spin_lock(&cq->lock); + mlx4_lock(&cq->lock); cqe = align_queue_size(cqe + 1); if (cqe == ibcq->cqe + 1) { @@ -250,7 +931,7 @@ goto out; } - ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, + ret = mlx4_alloc_cq_buf(to_mctx(ibcq->context), &buf, cqe, cq->cqe_size); if (ret) goto out; @@ -268,17 +949,24 @@ ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd); #endif if (ret) { - mlx4_free_buf(&buf); + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(ibcq->context), &buf); + else + mlx4_free_buf(&buf); goto out; } mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); - mlx4_free_buf(&cq->buf); - cq->buf = buf; + if (cq->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(ibcq->context), &cq->buf); + else + mlx4_free_buf(&cq->buf); + cq->buf = buf; + mlx4_update_cons_index(cq); out: - pthread_spin_unlock(&cq->lock); + 
mlx4_unlock(&cq->lock); return ret; } @@ -291,14 +979,32 @@ return ret; mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); - mlx4_free_buf(&to_mcq(cq)->buf); + if (to_mcq(cq)->buf.hmem != NULL) + mlx4_free_buf_huge(to_mctx(cq->context), &to_mcq(cq)->buf); + else + mlx4_free_buf(&to_mcq(cq)->buf); free(to_mcq(cq)); return 0; } +void *mlx4_get_legacy_xrc(struct ibv_srq *srq) +{ + struct mlx4_srq *msrq = to_msrq(srq); + + return msrq->ibv_srq_legacy; +} + +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq) +{ + struct mlx4_srq *msrq = to_msrq(srq); + + msrq->ibv_srq_legacy = legacy_xrc_srq; + return; +} + struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, - struct ibv_srq_init_attr *attr) + struct ibv_srq_init_attr *attr) { struct mlx4_create_srq cmd; struct mlx4_create_srq_resp resp; @@ -309,16 +1015,17 @@ if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) return NULL; - srq = malloc(sizeof *srq); + srq = calloc(1, sizeof *srq); if (!srq) return NULL; - if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + if (mlx4_spinlock_init(&srq->lock, !mlx4_single_threaded)) goto err; srq->max = align_queue_size(attr->attr.max_wr + 1); srq->max_gs = attr->attr.max_sge; srq->counter = 0; + srq->ext_srq = 0; if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) goto err; @@ -332,15 +1039,13 @@ cmd.buf_addr = (uintptr_t) srq->buf.buf; cmd.db_addr = (uintptr_t) srq->db; - ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr, &cmd.ibv_cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); if (ret) goto err_db; - srq->srqn = resp.srqn; - - return &srq->ibv_srq; + return &srq->verbs_srq.srq; err_db: mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); @@ -355,12 +1060,27 @@ return NULL; } +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || + (attr_ex->srq_type == IBV_SRQT_BASIC)) + return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex); + else if (attr_ex->srq_type == IBV_SRQT_XRC) + return mlx4_create_xrc_srq(context, attr_ex); + + return NULL; +} + int mlx4_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, - enum ibv_srq_attr_mask attr_mask) + int attr_mask) { struct ibv_modify_srq cmd; + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); } @@ -369,199 +1089,98 @@ { struct ibv_query_srq cmd; + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); } -int mlx4_destroy_srq(struct ibv_srq *ibsrq) +int mlx4_destroy_srq(struct ibv_srq *srq) { - struct mlx4_srq *srq = to_msrq(ibsrq); - struct mlx4_cq *mcq = NULL; int ret; + struct ibv_srq *legacy_srq = NULL; - if (ibsrq->xrc_cq) { - /* is an xrc_srq */ - mcq = to_mcq(ibsrq->xrc_cq); - mlx4_cq_clean(mcq, 0, srq); - pthread_spin_lock(&mcq->lock); - mlx4_clear_xrc_srq(to_mctx(ibsrq->context), srq->srqn); - pthread_spin_unlock(&mcq->lock); + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) { + legacy_srq = srq; + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); } - ret = ibv_cmd_destroy_srq(ibsrq); - if (ret) { - if (ibsrq->xrc_cq) { - pthread_spin_lock(&mcq->lock); - mlx4_store_xrc_srq(to_mctx(ibsrq->context), - srq->srqn, srq); - pthread_spin_unlock(&mcq->lock); - } - 
return ret; + if (to_msrq(srq)->ext_srq) { + ret = mlx4_destroy_xrc_srq(srq); + if (ret) + return ret; + + if (legacy_srq) + free(legacy_srq); + + return 0; } - mlx4_free_db(to_mctx(ibsrq->context), MLX4_DB_TYPE_RQ, srq->db); - mlx4_free_buf(&srq->buf); - free(srq->wrid); - free(srq); + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); return 0; } -static int verify_sizes(struct ibv_qp_init_attr *attr, struct mlx4_context *context) +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) { - int size; - int nsegs; - - if (attr->cap.max_send_wr > context->max_qp_wr || - attr->cap.max_recv_wr > context->max_qp_wr || - attr->cap.max_send_sge > context->max_sge || - attr->cap.max_recv_sge > context->max_sge) - return -1; - - if (attr->cap.max_inline_data) { - nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type); - size = MLX4_MAX_WQE_SIZE - nsegs * sizeof (struct mlx4_wqe_inline_seg); - switch (attr->qp_type) { - case IBV_QPT_UD: - size -= (sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_datagram_seg)); - break; - - case IBV_QPT_RC: - case IBV_QPT_UC: - case IBV_QPT_XRC: - size -= (sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_raddr_seg)); - break; - - default: - return 0; - } - - if (attr->cap.max_inline_data > size) - return -1; - } - - return 0; + read_init_vars(to_mctx(context)); + return mlx4_exp_create_qp(context, (struct ibv_exp_qp_init_attr *)attr); } struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) { - struct mlx4_create_qp cmd; - struct ibv_create_qp_resp resp; - struct mlx4_qp *qp; - int ret; - struct mlx4_context *context = to_mctx(pd->context); - + struct ibv_exp_qp_init_attr attr_exp; + struct ibv_qp *qp; + /* We should copy below only the shared fields excluding the xrc_domain field. + * Otherwise we may have an ABI issue with applications that were compiled + * without the xrc_domain field. The xrc_domain any way has no affect in + * the sender side, no need to copy in/out. + */ + int init_attr_base_size = offsetof(struct ibv_qp_init_attr, xrc_domain); + + /* copying only shared fields */ + memcpy(&attr_exp, attr, init_attr_base_size); + attr_exp.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + attr_exp.pd = pd; + qp = mlx4_exp_create_qp(pd->context, &attr_exp); + if (qp) + memcpy(attr, &attr_exp, init_attr_base_size); + return qp; +} - /* Sanity check QP size before proceeding */ - if (verify_sizes(attr, context)) - return NULL; +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; - qp = malloc(sizeof *qp); + qp = calloc(1, sizeof *qp); if (!qp) return NULL; - mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); - - /* - * We need to leave 2 KB + 1 WQE of headroom in the SQ to - * allow HW to prefetch. 
- */ - qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; - qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); - qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); - - if (attr->srq || attr->qp_type == IBV_QPT_XRC) - attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0; - else { - if (attr->cap.max_recv_sge < 1) - attr->cap.max_recv_sge = 1; - if (attr->cap.max_recv_wr < 1) - attr->cap.max_recv_wr = 1; - } - - if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) - goto err; - - mlx4_init_qp_indices(qp); - - if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || - pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) - goto err_free; - - if (!attr->srq && attr->qp_type != IBV_QPT_XRC) { - qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); - if (!qp->db) - goto err_free; - - *qp->db = 0; - } - - cmd.buf_addr = (uintptr_t) qp->buf.buf; - if (attr->srq || attr->qp_type == IBV_QPT_XRC) - cmd.db_addr = 0; - else - cmd.db_addr = (uintptr_t) qp->db; - cmd.log_sq_stride = qp->sq.wqe_shift; - for (cmd.log_sq_bb_count = 0; - qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; - ++cmd.log_sq_bb_count) - ; /* nothing */ - cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ - memset(cmd.reserved, 0, sizeof cmd.reserved); - - pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); - - ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, - &resp, sizeof resp); - if (ret) - goto err_rq_db; - - ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); + ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, + &cmd, sizeof cmd, &resp, sizeof resp); if (ret) - goto err_destroy; - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); - - qp->rq.wqe_cnt = attr->cap.max_recv_wr; - qp->rq.max_gs = attr->cap.max_recv_sge; - - /* adjust rq maxima to not exceed reported device maxima */ - attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr); - attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge); - - qp->rq.max_post = attr->cap.max_recv_wr; - mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); - - qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8); - if (attr->sq_sig_all) - qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE); - else - qp->sq_signal_bits = 0; - - return &qp->ibv_qp; - -err_destroy: - ibv_cmd_destroy_qp(&qp->ibv_qp); - -err_rq_db: - pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); - if (!attr->srq && attr->qp_type != IBV_QPT_XRC) - mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db); + goto err; -err_free: - free(qp->sq.wrid); - if (qp->rq.wqe_cnt) - free(qp->rq.wrid); - mlx4_free_buf(&qp->buf); + return &qp->verbs_qp.qp; err: free(qp); - return NULL; } int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask, + int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; @@ -582,11 +1201,17 @@ } int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, - enum ibv_qp_attr_mask attr_mask) + int attr_mask) { struct ibv_modify_qp cmd; int ret; + if (attr_mask & IBV_QP_PORT) { + ret = update_port_data(qp, attr->port_num); + if (ret) + return ret; + } + if (qp->state == IBV_QPS_RESET && attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_INIT) { @@ -598,13 +1223,14 @@ if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { - mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, - qp->srq ? 
to_msrq(qp->srq) : NULL); - if (qp->send_cq != qp->recv_cq) + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); mlx4_init_qp_indices(to_mqp(qp)); - if (!qp->srq && qp->qp_type != IBV_QPT_XRC) + if (to_mqp(qp)->rq.wqe_cnt) *to_mqp(qp)->db = 0; } @@ -616,14 +1242,19 @@ struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) - pthread_spin_lock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { - pthread_spin_lock(&send_cq->lock); - pthread_spin_lock(&recv_cq->lock); + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + mlx4_lock(&send_cq->lock); + else if (qp->recv_cq) + mlx4_lock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + mlx4_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx4_lock(&send_cq->lock); + mlx4_lock(&recv_cq->lock); } else { - pthread_spin_lock(&recv_cq->lock); - pthread_spin_lock(&send_cq->lock); + mlx4_lock(&recv_cq->lock); + mlx4_lock(&send_cq->lock); } } @@ -632,14 +1263,20 @@ struct mlx4_cq *send_cq = to_mcq(qp->send_cq); struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); - if (send_cq == recv_cq) - pthread_spin_unlock(&send_cq->lock); - else if (send_cq->cqn < recv_cq->cqn) { - pthread_spin_unlock(&recv_cq->lock); - pthread_spin_unlock(&send_cq->lock); + + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + mlx4_unlock(&send_cq->lock); + else if (qp->recv_cq) + mlx4_unlock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + mlx4_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx4_unlock(&recv_cq->lock); + mlx4_unlock(&send_cq->lock); } else { - pthread_spin_unlock(&send_cq->lock); - pthread_spin_unlock(&recv_cq->lock); + mlx4_unlock(&send_cq->lock); + mlx4_unlock(&recv_cq->lock); } } @@ -656,246 +1293,120 @@ } mlx4_lock_cqs(ibqp); - - __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); - if (ibqp->send_cq != ibqp->recv_cq) + if (ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); - mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); mlx4_unlock_cqs(ibqp); pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); - if (!ibqp->srq && ibqp->qp_type != IBV_QPT_XRC) - mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); - free(qp->sq.wrid); + /* + * Use the qp->bf to check if the QP is using dedicated BF. + * If so, update the dedicated BF database. 
+ */ + if (qp->bf && (&qp->bf->cmn != &(to_mctx(ibqp->context)->bfs.cmn_bf))) { + struct mlx4_bfs_data *bfs = &to_mctx(ibqp->context)->bfs; + int idx = &(qp->bf->dedic) - bfs->dedic_bf; + + if (0 <= idx && idx < (MLX4_MAX_BFS_IN_PAGE - 1)) { + mlx4_spin_lock(&bfs->dedic_bf_lock); + bfs->dedic_bf_used[idx] = 0; + bfs->dedic_bf_free++; + mlx4_spin_unlock(&bfs->dedic_bf_lock); + } + } + if (qp->rq.wqe_cnt) - free(qp->rq.wrid); - mlx4_free_buf(&qp->buf); + mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); + + mlx4_dealloc_qp_buf(ibqp->context, qp); + free(qp); return 0; } -struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +struct ibv_ah *mlx4_create_ah_common(struct ibv_pd *pd, + struct ibv_ah_attr *attr, + uint8_t link_layer) { struct mlx4_ah *ah; - struct ibv_port_attr port_attr; - uint8_t is_mcast; + + if (unlikely(!attr->dlid) && + (link_layer != IBV_LINK_LAYER_ETHERNET)) { + errno = EINVAL; + return NULL; + } ah = malloc(sizeof *ah); if (!ah) return NULL; - memset(ah, 0, sizeof *ah); + memset(&ah->av, 0, sizeof ah->av); ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24)); - ah->av.g_slid = attr->src_path_bits; - ah->av.dlid = htons(attr->dlid); + + if (link_layer != IBV_LINK_LAYER_ETHERNET) { + ah->av.g_slid = attr->src_path_bits; + ah->av.dlid = htons(attr->dlid); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); + } else { + ah->vlan = ((attr->sl & 7) << 13); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29); + } + if (attr->static_rate) { ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; /* XXX check rate cap? */ } - ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); if (attr->is_global) { ah->av.g_slid |= 0x80; ah->av.gid_index = attr->grh.sgid_index; - ah->av.hop_limit = attr->grh.hop_limit; + if (attr->grh.hop_limit < 2) + ah->av.hop_limit = 0xff; + else + ah->av.hop_limit = attr->grh.hop_limit; ah->av.sl_tclass_flowlabel |= htonl((attr->grh.traffic_class << 20) | attr->grh.flow_label); memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); } - if (ibv_query_port(pd->context, attr->port_num, &port_attr)) - goto err; - - if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { - if (ibv_resolve_eth_gid(pd, attr->port_num, - (union ibv_gid *)ah->av.dgid, - attr->grh.sgid_index, - ah->mac, &ah->vlan, - &ah->tagged, &is_mcast)) - goto err; - - if (is_mcast) { - ah->av.dlid = htons(0xc000); - ah->av.port_pd |= htonl(1 << 31); - } - if (ah->tagged) { - ah->av.port_pd |= htonl(1 << 29); - ah->vlan |= (attr->sl & 7) << 13; - } - } - - return &ah->ibv_ah; -err: - free(ah); - return NULL; } -int mlx4_destroy_ah(struct ibv_ah *ah) -{ - free(to_mah(ah)); - - return 0; -} - -#ifdef HAVE_IBV_XRC_OPS -struct ibv_srq *mlx4_create_xrc_srq(struct ibv_pd *pd, - struct ibv_xrc_domain *xrc_domain, - struct ibv_cq *xrc_cq, - struct ibv_srq_init_attr *attr) +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) { - struct mlx4_create_xrc_srq cmd; - struct mlx4_create_srq_resp resp; - struct mlx4_srq *srq; - int ret; - - /* Sanity check SRQ size before proceeding */ - if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) - return NULL; - - srq = malloc(sizeof *srq); - if (!srq) - return NULL; - - if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) - goto err; - - srq->max = align_queue_size(attr->attr.max_wr + 1); - srq->max_gs = attr->attr.max_sge; - srq->counter = 0; - - if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) - goto err; - - srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); - if (!srq->db) - 
goto err_free; - - *srq->db = 0; - - cmd.buf_addr = (uintptr_t) srq->buf.buf; - cmd.db_addr = (uintptr_t) srq->db; - - ret = ibv_cmd_create_xrc_srq(pd, &srq->ibv_srq, attr, - xrc_domain->handle, - xrc_cq->handle, - &cmd.ibv_cmd, sizeof cmd, - &resp.ibv_resp, sizeof resp); - if (ret) - goto err_db; + struct ibv_ah *ah; + struct ibv_exp_port_attr port_attr; + struct ibv_port_attr port_attr_legacy; + uint8_t link_layer; - srq->ibv_srq.xrc_srq_num = srq->srqn = resp.srqn; + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; - ret = mlx4_store_xrc_srq(to_mctx(pd->context), srq->ibv_srq.xrc_srq_num, srq); - if (ret) - goto err_destroy; - - return &srq->ibv_srq; - -err_destroy: - ibv_cmd_destroy_srq(&srq->ibv_srq); - -err_db: - mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); - -err_free: - free(srq->wrid); - mlx4_free_buf(&srq->buf); + if (ibv_exp_query_port(pd->context, attr->port_num, &port_attr)) { + if (ibv_query_port(pd->context, attr->port_num, &port_attr_legacy)) + return NULL; -err: - free(srq); - - return NULL; -} - -struct ibv_xrc_domain *mlx4_open_xrc_domain(struct ibv_context *context, - int fd, int oflag) -{ - int ret; - struct mlx4_open_xrc_domain_resp resp; - struct mlx4_xrc_domain *xrcd; - - xrcd = malloc(sizeof *xrcd); - if (!xrcd) - return NULL; - - ret = ibv_cmd_open_xrc_domain(context, fd, oflag, &xrcd->ibv_xrcd, - &resp.ibv_resp, sizeof resp); - if (ret) { - free(xrcd); - return NULL; + link_layer = port_attr_legacy.link_layer; + } else { + link_layer = port_attr.link_layer; } - xrcd->xrcdn = resp.xrcdn; - return &xrcd->ibv_xrcd; -} - -int mlx4_close_xrc_domain(struct ibv_xrc_domain *d) -{ - int ret; - ret = ibv_cmd_close_xrc_domain(d); - if (!ret) - free(d); - return ret; -} - -int mlx4_create_xrc_rcv_qp(struct ibv_qp_init_attr *init_attr, - uint32_t *xrc_qp_num) -{ - - return ibv_cmd_create_xrc_rcv_qp(init_attr, xrc_qp_num); -} + ah = mlx4_create_ah_common(pd, attr, link_layer); -int mlx4_modify_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask) -{ - return ibv_cmd_modify_xrc_rcv_qp(xrc_domain, xrc_qp_num, - attr, attr_mask); + return ah; } -int mlx4_query_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num, - struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr) +int mlx4_destroy_ah(struct ibv_ah *ah) { - int ret; - - ret = ibv_cmd_query_xrc_rcv_qp(xrc_domain, xrc_qp_num, - attr, attr_mask, init_attr); - if (ret) - return ret; - - init_attr->cap.max_send_wr = init_attr->cap.max_send_sge = 1; - init_attr->cap.max_recv_sge = init_attr->cap.max_recv_wr = 0; - init_attr->cap.max_inline_data = 0; - init_attr->recv_cq = init_attr->send_cq = NULL; - init_attr->srq = NULL; - init_attr->xrc_domain = xrc_domain; - init_attr->qp_type = IBV_QPT_XRC; - init_attr->qp_context = NULL; - attr->cap = init_attr->cap; + free(to_mah(ah)); return 0; } - -int mlx4_reg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num) -{ - return ibv_cmd_reg_xrc_rcv_qp(xrc_domain, xrc_qp_num); -} - -int mlx4_unreg_xrc_rcv_qp(struct ibv_xrc_domain *xrc_domain, - uint32_t xrc_qp_num) -{ - return ibv_cmd_unreg_xrc_rcv_qp(xrc_domain, xrc_qp_num); -} - -#endif Index: contrib/ofed/libmlx4/src/verbs_exp.c =================================================================== --- /dev/null +++ contrib/ofed/libmlx4/src/verbs_exp.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* Added for reg_mr mmap munmap system calls */ +#include +#include "mlx4.h" +#include "mlx4-abi.h" +#include "mlx4_exp.h" +#include "wqe.h" + +static const char *qptype2key(enum ibv_qp_type type) +{ + switch (type) { + case IBV_QPT_RC: return "HUGE_RC"; + case IBV_QPT_UC: return "HUGE_UC"; + case IBV_QPT_UD: return "HUGE_UD"; +#ifdef _NOT_EXISTS_IN_OFED_2_0 + case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH"; +#endif + + default: return "HUGE_NA"; + } +} + +static void update_qp_cap_cache(struct ibv_qp *qp) +{ + struct mlx4_context *ctx = to_mctx(qp->context); + struct mlx4_qp *mqp = to_mqp(qp); + + if (((qp->qp_type == IBV_QPT_RAW_ETH) && (mqp->link_layer == IBV_LINK_LAYER_ETHERNET)) && + (ctx->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT)) + mqp->qp_cap_cache |= MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP; +} + +int update_port_data(struct ibv_qp *qp, uint8_t port_num) +{ + struct mlx4_qp *mqp = to_mqp(qp); + struct ibv_port_attr port_attr; + int err; + + err = ibv_query_port(qp->context, port_num, &port_attr); + if (err) + return err; + + mqp->link_layer = port_attr.link_layer; + update_qp_cap_cache(qp); + + return 0; +} + +int mlx4_exp_modify_qp(struct ibv_qp *qp, struct ibv_exp_qp_attr *attr, + uint64_t attr_mask) +{ + struct ibv_exp_modify_qp cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + if (attr_mask & IBV_QP_PORT) { + ret = update_port_data(qp, attr->port_num); + if (ret) + return ret; + } + + if (qp->state == IBV_QPS_RESET && + (attr_mask & IBV_EXP_QP_STATE) && + attr->qp_state == IBV_QPS_INIT) { + mlx4_qp_init_sq_ownership(to_mqp(qp)); + } + + + ret = ibv_exp_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); + + if (!ret && + (attr_mask & IBV_EXP_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? 
to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mlx4_init_qp_indices(to_mqp(qp)); + if (to_mqp(qp)->rq.wqe_cnt) + *to_mqp(qp)->db = 0; + } + + return ret; +} + +static int verify_sizes(struct ibv_exp_qp_init_attr *attr, struct mlx4_context *context) +{ + int size; + int nsegs; + + if (attr->cap.max_send_wr > context->max_qp_wr || + attr->cap.max_recv_wr > context->max_qp_wr || + attr->cap.max_send_sge > context->max_sge || + attr->cap.max_recv_sge > context->max_sge) + return -1; + + if (attr->cap.max_inline_data) { + nsegs = num_inline_segs(attr->cap.max_inline_data, attr->qp_type); + size = MLX4_MAX_WQE_SIZE - nsegs * sizeof(struct mlx4_wqe_inline_seg); + switch (attr->qp_type) { + case IBV_QPT_UD: + size -= (sizeof(struct mlx4_wqe_ctrl_seg) + + sizeof(struct mlx4_wqe_datagram_seg)); + break; + + case IBV_QPT_RC: + case IBV_QPT_UC: + size -= (sizeof(struct mlx4_wqe_ctrl_seg) + + sizeof(struct mlx4_wqe_raddr_seg)); + break; + + default: + return 0; + } + + if (attr->cap.max_inline_data > size) + return -1; + } + + return 0; +} + +static int mlx4_exp_alloc_qp_buf(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr, + struct mlx4_qp *qp) +{ + int ret; + enum mlx4_alloc_type alloc_type; + enum mlx4_alloc_type default_alloc_type = MLX4_ALLOC_TYPE_PREFER_CONTIG; + const char *qp_huge_key; + int i, wqe_size; + + qp->rq.max_gs = attr->cap.max_recv_sge; + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + if ((attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) && (attr->max_inl_recv)) { + qp->max_inlr_sg = qp->rq.max_gs; + wqe_size = max(wqe_size, attr->max_inl_recv); + } + for (qp->rq.wqe_shift = 4; 1 << qp->rq.wqe_shift < wqe_size; qp->rq.wqe_shift++) + ; /* nothing */ + + if (qp->max_inlr_sg) { + attr->max_inl_recv = 1 << qp->rq.wqe_shift; + qp->max_inlr_sg = attr->max_inl_recv / sizeof(struct mlx4_wqe_data_seg); + } + + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t)); + if (!qp->sq.wrid) + return -1; + } + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + + if (qp->max_inlr_sg) { + qp->inlr_buff.buff = malloc(qp->rq.wqe_cnt * sizeof(*(qp->inlr_buff.buff))); + if (!qp->inlr_buff.buff) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + qp->inlr_buff.len = qp->rq.wqe_cnt; + qp->inlr_buff.buff[0].sg_list = malloc(qp->rq.wqe_cnt * + sizeof(*(qp->inlr_buff.buff->sg_list)) * + qp->max_inlr_sg); + if (!qp->inlr_buff.buff->sg_list) { + free(qp->sq.wrid); + free(qp->rq.wrid); + free(qp->inlr_buff.buff); + return -1; + } + for (i = 1; i < qp->rq.wqe_cnt; i++) + qp->inlr_buff.buff[i].sg_list = &qp->inlr_buff.buff[0].sg_list[i * qp->max_inlr_sg]; + } + } + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + if (qp->buf_size) { + /* compatability support */ + qp_huge_key = qptype2key(attr->qp_type); + if (mlx4_use_huge(context, qp_huge_key)) + default_alloc_type = MLX4_ALLOC_TYPE_HUGE; + + + mlx4_get_alloc_type(context, MLX4_QP_PREFIX, &alloc_type, + default_alloc_type); + + ret = mlx4_alloc_prefered_buf(to_mctx(context), &qp->buf, + align(qp->buf_size, to_mdev + (context->device)->page_size), + to_mdev(context->device)->page_size, + alloc_type, + MLX4_QP_PREFIX); + + if (ret) { + free(qp->sq.wrid); + free(qp->rq.wrid); + if (qp->max_inlr_sg) { + free(qp->inlr_buff.buff[0].sg_list); + free(qp->inlr_buff.buff); + } 
+ return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.buf = qp->buf.buf; + qp->sq.buf = qp->buf.buf + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + } else { + qp->rq.buf = qp->buf.buf + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + qp->sq.buf = qp->buf.buf; + } + + } else { + qp->buf.buf = NULL; + } + + return 0; +} + +static uint64_t send_db_to_uar(uintptr_t send_db) +{ + return (send_db - MLX4_SEND_DOORBELL); +} + +static uint32_t *uar_to_send_db(uintptr_t uar) +{ + return (uint32_t *)(uar + MLX4_SEND_DOORBELL); +} + +static void update_qp_bf_data(struct mlx4_res_domain *res_domain, + struct mlx4_qp *qp, struct ibv_context *context) +{ + switch (res_domain->type) { + case MLX4_RES_DOMAIN_BF_SAFE: + qp->db_method = MLX4_QP_DB_METHOD_BF; + break; + case MLX4_RES_DOMAIN_BF_UNSAFE: + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + break; + case MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT: + if (to_mctx(context)->prefer_bf) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB; + else + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + break; + default: + break; + } + qp->bf = &res_domain->send_db->bf; + qp->sdb = res_domain->send_db->db_addr; + qp->bf_buf_size = to_mctx(context)->bfs.buf_size; +} + +struct ibv_qp *mlx4_exp_create_qp(struct ibv_context *context, + struct ibv_exp_qp_init_attr *attr) +{ + struct mlx4_qp *qp; + int ret; + union { + struct mlx4_create_qp basic; + struct mlx4_exp_create_qp extended; + } cmd_obj; + union { + struct ibv_create_qp_resp basic; + struct ibv_exp_create_qp_resp extended; + } resp_obj; + struct mlx4_create_qp_base *cmd = NULL; + int ext_kernel_cmd = 0; + struct mlx4_bfs_data *bfs = &to_mctx(context)->bfs; + int i; + unsigned char cq_update; + int thread_safe = !mlx4_single_threaded; + int db_method_defined = 0; + + memset(&resp_obj, 0, sizeof(resp_obj)); + memset(&cmd_obj, 0, sizeof(cmd_obj)); + + if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_RESERVED1) { + errno = ENOSYS; + return NULL; + } + + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) { + if (attr->srq) + attr->max_inl_recv = 0; + else + attr->max_inl_recv = min(attr->max_inl_recv, + (to_mctx(context)->max_sge * + sizeof(struct mlx4_wqe_data_seg))); + } + + /* Sanity check QP size before proceeding */ + if (verify_sizes(attr, to_mctx(context))) + return NULL; + + if (attr->qp_type == IBV_QPT_XRC && attr->recv_cq && + attr->cap.max_recv_wr > 0 && mlx4_trace) + fprintf(stderr, PFX "Warning: Legacy XRC sender should not use a recieve cq\n"); + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + qp->qp_cap_cache = 0; + if (attr->comp_mask >= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS) + ext_kernel_cmd = 1; + if (attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; + } else { + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG && + attr->max_atomic_arg != 0) { + if (attr->max_atomic_arg == 8) { + qp->is_masked_atomic = 1; + } else { + fprintf(stderr, "%s: max_atomic_arg = %d is not valid for mlx4 (use 8 or 0)\n", + __FUNCTION__, attr->max_atomic_arg); + errno = EINVAL; + goto err; + } + } + + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. 
+ */ +#ifdef MLX4_WQE_FORMAT + qp->sq_spare_wqes = 0; +#else + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; +#endif + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + } + + if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || + attr->qp_type == IBV_QPT_XRC_RECV || + attr->qp_type == IBV_QPT_XRC) { + attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_INL_RECV) + attr->max_inl_recv = 0; + } else { + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS) + qp->create_flags = attr->exp_create_flags & IBV_EXP_QP_CREATE_MASK; + + if (mlx4_exp_alloc_qp_buf(context, attr, qp)) + goto err; + + mlx4_init_qp_indices(qp); + + qp->sdb = (uint32_t *) (to_mctx(context)->uar + MLX4_SEND_DOORBELL); + if (attr->comp_mask & IBV_EXP_QP_INIT_ATTR_RES_DOMAIN) { + struct mlx4_res_domain *rd; + + if (!attr->res_domain) { + errno = EINVAL; + goto err_free; + } + rd = to_mres_domain(attr->res_domain); + if (rd->attr.thread_model == IBV_EXP_THREAD_UNSAFE || + rd->attr.thread_model == IBV_EXP_THREAD_SINGLE) + thread_safe = 0; + + if (rd->send_db) { + cmd_obj.extended.exp_cmd.uar_virt_add = send_db_to_uar((uintptr_t)rd->send_db->db_addr); + update_qp_bf_data(rd, qp, context); + db_method_defined = 1; + } + } + + if (mlx4_lock_init(&qp->sq.lock, thread_safe, mlx4_get_locktype())) + goto err_free; + if (mlx4_lock_init(&qp->rq.lock, thread_safe, mlx4_get_locktype())) + goto sq_lock_destroy; + + cmd = (ext_kernel_cmd ? + &cmd_obj.extended.exp_cmd.base : &cmd_obj.basic.base); + + if (attr->cap.max_recv_sge) { + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto rq_lock_destroy; + + *qp->db = 0; + cmd->db_addr = (uintptr_t) qp->db; + } else { + cmd->db_addr = 0; + } + + cmd->buf_addr = (uintptr_t) qp->buf.buf; + cmd->log_sq_stride = qp->sq.wqe_shift; + for (cmd->log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd->log_sq_bb_count; + ++cmd->log_sq_bb_count) + ; /* nothing */ + cmd->sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + memset(cmd->reserved, 0, sizeof(cmd->reserved)); + + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); + ret = ibv_exp_cmd_create_qp(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + ext_kernel_cmd ? + (void *)&cmd_obj.extended.ibv_cmd : + (void *)&cmd_obj.basic.ibv_cmd, + ext_kernel_cmd ? + sizeof(cmd_obj.extended.ibv_cmd) : + sizeof(cmd_obj.basic.ibv_cmd), + ext_kernel_cmd ? + sizeof(cmd_obj.extended.exp_cmd) : + sizeof(cmd_obj.basic.base), + ext_kernel_cmd ? + (void *)&resp_obj.extended : (void *)&resp_obj.basic, + ext_kernel_cmd ? 
+ sizeof(resp_obj.extended) : + sizeof(resp_obj.basic), + 0, 0); + if (ret) { + errno = ret; + goto err_rq_db; + } + + if (qp->max_inlr_sg && (attr->max_inl_recv != (1 << qp->rq.wqe_shift))) + goto err_destroy; + + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp); + if (ret) + goto err_destroy; + } + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + + qp->rq.wqe_cnt = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + + /* adjust rq maxima to not exceed reported device maxima */ + attr->cap.max_recv_wr = min(to_mctx(context)->max_qp_wr, + attr->cap.max_recv_wr); + attr->cap.max_recv_sge = min(to_mctx(context)->max_sge, + attr->cap.max_recv_sge); + + qp->rq.max_post = attr->cap.max_recv_wr; + if (attr->qp_type != IBV_QPT_XRC_RECV) + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htonl(qp->verbs_qp.qp.qp_num << 8); + if (attr->sq_sig_all) + cq_update = MLX4_WQE_CTRL_CQ_UPDATE; + else + cq_update = 0; + + /* + * The rcrb_flags_tbl is a table to get the right value for the first + * byte of srcrb_flags field on the WQE ctrl segment. + * The value is derived from the QP sq_sig_all flag and the 4 WR flags + * IBV_EXP_SEND_SIGNALED, IBV_EXP_SEND_SOLICITED, IBV_EXP_SEND_IP_CSUM + * and IBV_EXP_SEND_TUNNEL. + * These flags used as an index to get the required value from the table. + * The IBV_EXP_SEND_SIGNALED flag defines first bit of the index the + * IBV_EXP_SEND_SOLICITED defines the second bit the IBV_EXP_SEND_IP_CSUM + * defines the third bit and IBV_EXP_SEND_TUNNEL the fourth one. + * Therefore to calculate the index we can use: + * idx = (exp_send_flags & IBV_EXP_SEND_SIGNALED)/IBV_EXP_SEND_SIGNALED | + * (exp_send_flags & IBV_EXP_SEND_SOLICITED)/(IBV_EXP_SEND_SOLICITED >> 1) | + * (exp_send_flags & IBV_EXP_SEND_IP_CSUM)/(IBV_EXP_SEND_IP_CSUM >> 2); + * (exp_send_flags & IBV_EXP_SEND_TUNNEL)/(IBV_EXP_SEND_TUNNEL >> 3); + */ + qp->srcrb_flags_tbl[0] = cq_update; + qp->srcrb_flags_tbl[1] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[2] = MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[3] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[4] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | cq_update; + qp->srcrb_flags_tbl[5] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[6] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[7] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[8] = cq_update; + qp->srcrb_flags_tbl[9] = MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[10] = MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[11] = MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[12] = MLX4_WQE_CTRL_IP_CSUM | cq_update; + qp->srcrb_flags_tbl[13] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | cq_update; + qp->srcrb_flags_tbl[14] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_SOLICIT | cq_update; + qp->srcrb_flags_tbl[15] = MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICIT | cq_update; + + qp->qp_type = attr->qp_type; + + /* Set default value of cached RX csum flags to 0 */ + qp->cached_rx_csum_flags = 0; + /* Set transposed_rx_csum_flags to match the cached_rx_csum_flags = 0 */ + qp->transposed_rx_csum_flags = IBV_EXP_CQ_RX_OUTER_IPV6_PACKET; + + 
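	/*
	 * A minimal sketch, assuming only the index mapping spelled out in the
	 * srcrb_flags_tbl comment above (SIGNALED selects bit 0, SOLICITED
	 * bit 1, IP_CSUM bit 2, TUNNEL bit 3).  The helper name and the
	 * uint64_t flags type are illustrative and are not part of this patch;
	 * the IBV_EXP_SEND_* flags are the ones named in that comment.
	 */
	static inline int example_srcrb_flags_idx(uint64_t exp_send_flags)
	{
		int idx = 0;

		if (exp_send_flags & IBV_EXP_SEND_SIGNALED)
			idx |= 1 << 0;
		if (exp_send_flags & IBV_EXP_SEND_SOLICITED)
			idx |= 1 << 1;
		if (exp_send_flags & IBV_EXP_SEND_IP_CSUM)
			idx |= 1 << 2;
		if (exp_send_flags & IBV_EXP_SEND_TUNNEL)
			idx |= 1 << 3;

		/*
		 * For example, SIGNALED | IP_CSUM yields idx 5, so the WQE gets
		 * srcrb_flags_tbl[5]: IP and TCP/UDP checksum offload plus
		 * CQ_UPDATE, in addition to the per-QP cq_update bits.
		 */
		return idx;
	}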
if (!db_method_defined && bfs->buf_size == 0) { + /* not using BF */ + qp->db_method = MLX4_QP_DB_METHOD_DB; + } else if (!db_method_defined) { + /* + * To gain performance the dedic_bf_free is first tested without taking + * the dedic_bf_lock. + */ + if (bfs->dedic_bf_free) { + mlx4_spin_lock(&bfs->dedic_bf_lock); + for (i = 0 ; i < bfs->num_dedic_bfs; i++) { + if (!bfs->dedic_bf_used[i]) { + /* using dedicated BF */ + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + qp->bf = (union mlx4_bf *)(&bfs->dedic_bf[i]); + bfs->dedic_bf_used[i] = 1; + bfs->dedic_bf_free--; + break; + } + } + mlx4_spin_unlock(&bfs->dedic_bf_lock); + } + if (!qp->bf) { + /* using common BF */ + if (mlx4_single_threaded) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF; + else + qp->db_method = MLX4_QP_DB_METHOD_BF; + qp->bf = (union mlx4_bf *)(&bfs->cmn_bf); + } + if (qp->db_method == MLX4_QP_DB_METHOD_DEDIC_BF && + mlx4_single_threaded && (wc_auto_evict_size() == 64)) { + if (to_mctx(context)->prefer_bf) + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_PB; + else + qp->db_method = MLX4_QP_DB_METHOD_DEDIC_BF_1_THREAD_WC_EVICT_NPB; + } + qp->bf_buf_size = bfs->buf_size; + } + + qp->model_flags = thread_safe ? MLX4_QP_MODEL_FLAG_THREAD_SAFE : 0; + mlx4_update_post_send_one(qp); + qp->pattern = MLX4_QP_PATTERN; + + return &qp->verbs_qp.qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->verbs_qp.qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + if (attr->cap.max_recv_sge) + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); + +rq_lock_destroy: + mlx4_lock_destroy(&qp->rq.lock); + +sq_lock_destroy: + mlx4_lock_destroy(&qp->sq.lock); + +err_free: + mlx4_dealloc_qp_buf(context, qp); + +err: + free(qp); + + return NULL; +} + +int mlx4_exp_query_device(struct ibv_context *context, + struct ibv_exp_device_attr *device_attr) +{ + struct ibv_exp_query_device cmd; + struct ibv_port_attr port_attr; + uint64_t raw_fw_ver; + int ret; + int i; + + ret = ibv_exp_cmd_query_device(context, device_attr, &raw_fw_ver, + &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (device_attr->exp_device_cap_flags & IBV_EXP_DEVICE_CROSS_CHANNEL) { + device_attr->comp_mask |= IBV_EXP_DEVICE_ATTR_CALC_CAP; + device_attr->calc_cap.data_types = (1ULL << IBV_EXP_CALC_DATA_TYPE_INT) | + (1ULL << IBV_EXP_CALC_DATA_TYPE_UINT) | + (1ULL << IBV_EXP_CALC_DATA_TYPE_FLOAT); + device_attr->calc_cap.data_sizes = (1ULL << IBV_EXP_CALC_DATA_SIZE_64_BIT); + device_attr->calc_cap.int_ops = (1ULL << IBV_EXP_CALC_OP_ADD) | + (1ULL << IBV_EXP_CALC_OP_BAND) | + (1ULL << IBV_EXP_CALC_OP_BXOR) | + (1ULL << IBV_EXP_CALC_OP_BOR); + device_attr->calc_cap.uint_ops = device_attr->calc_cap.int_ops; + device_attr->calc_cap.fp_ops = device_attr->calc_cap.int_ops; + } + device_attr->exp_device_cap_flags |= IBV_EXP_DEVICE_MR_ALLOCATE; + + if ((device_attr->comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS) && + (device_attr->exp_device_cap_flags & (IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT | + IBV_EXP_DEVICE_RX_CSUM_IP_PKT | + IBV_EXP_DEVICE_VXLAN_SUPPORT))) { + for (i = 0; i < device_attr->phys_port_cnt; i++) { + ret = mlx4_query_port(context, i + 1, &port_attr); + if (ret) + return ret; + + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + device_attr->exp_device_cap_flags &= ~(IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT | + IBV_EXP_DEVICE_RX_CSUM_IP_PKT | + IBV_EXP_DEVICE_VXLAN_SUPPORT); + break; + } + } + } + + return __mlx4_query_device( + raw_fw_ver, + (struct ibv_device_attr *)device_attr); +} + +int mlx4_exp_query_port(struct ibv_context *context, 
uint8_t port_num, + struct ibv_exp_port_attr *port_attr) +{ + /* Check that only valid flags were given */ + if (!(port_attr->comp_mask & IBV_EXP_QUERY_PORT_ATTR_MASK1) || + (port_attr->comp_mask & ~IBV_EXP_QUERY_PORT_ATTR_MASKS) || + (port_attr->mask1 & ~IBV_EXP_QUERY_PORT_MASK)) { + return EINVAL; + } + + /* Optimize the link type query */ + if (port_attr->comp_mask == IBV_EXP_QUERY_PORT_ATTR_MASK1) { + if (!(port_attr->mask1 & ~(IBV_EXP_QUERY_PORT_LINK_LAYER | + IBV_EXP_QUERY_PORT_CAP_FLAGS))) { + struct mlx4_context *mctx = to_mctx(context); + if (port_num <= 0 || port_num > MLX4_PORTS_NUM) + return EINVAL; + if (mctx->port_query_cache[port_num - 1].valid) { + if (port_attr->mask1 & + IBV_EXP_QUERY_PORT_LINK_LAYER) + port_attr->link_layer = + mctx-> + port_query_cache[port_num - 1]. + link_layer; + if (port_attr->mask1 & + IBV_EXP_QUERY_PORT_CAP_FLAGS) + port_attr->port_cap_flags = + mctx-> + port_query_cache[port_num - 1]. + caps; + return 0; + } + } + if (port_attr->mask1 & IBV_EXP_QUERY_PORT_STD_MASK) { + return mlx4_query_port(context, port_num, + &port_attr->port_attr); + } + } + + return EOPNOTSUPP; +} + +struct ibv_ah *mlx4_exp_create_ah(struct ibv_pd *pd, + struct ibv_exp_ah_attr *attr_ex) +{ + struct ibv_exp_port_attr port_attr; + struct ibv_ah *ah; + struct mlx4_ah *mah; + + port_attr.comp_mask = IBV_EXP_QUERY_PORT_ATTR_MASK1; + port_attr.mask1 = IBV_EXP_QUERY_PORT_LINK_LAYER; + + if (ibv_exp_query_port(pd->context, attr_ex->port_num, &port_attr)) + return NULL; + + ah = mlx4_create_ah_common(pd, (struct ibv_ah_attr *)attr_ex, + port_attr.link_layer); + + if (NULL == ah) + return NULL; + + mah = to_mah(ah); + + /* If vlan was given, check that we could use it */ + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID && + attr_ex->vid <= 0xfff && + (0 == attr_ex->ll_address.len || + !(attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL))) + goto err; + + /* ll_address.len == 0 means no ll address given */ + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_LL && + 0 != attr_ex->ll_address.len) { + if (LL_ADDRESS_ETH != attr_ex->ll_address.type || + port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) + /* mlx4 provider currently only support ethernet + * extensions */ + goto err; + + /* link layer is ethernet */ + if (6 != attr_ex->ll_address.len || + NULL == attr_ex->ll_address.address) + goto err; + + memcpy(mah->mac, attr_ex->ll_address.address, + attr_ex->ll_address.len); + + if (attr_ex->comp_mask & IBV_EXP_AH_ATTR_VID && + attr_ex->vid <= 0xfff) { + mah->av.port_pd |= htonl(1 << 29); + mah->vlan = attr_ex->vid | + ((attr_ex->sl & 7) << 13); + } + } + + return ah; + +err: + free(ah); + return NULL; +} + +static struct mlx4_send_db_data *allocate_send_db(struct mlx4_context *ctx) +{ + struct mlx4_device *dev = to_mdev(ctx->ibv_ctx.device); + struct mlx4_send_db_data *send_db = NULL; + unsigned int uar_idx; + void *uar; + void *bfs; + int i; + + if (!ctx->max_ctx_res_domain || !ctx->bfs.buf_size) { + errno = EINVAL; + return NULL; + } + + mlx4_spin_lock(&ctx->send_db_lock); + if (!list_empty(&ctx->send_db_list)) { + send_db = list_entry(ctx->send_db_list.next, struct mlx4_send_db_data, list); + list_del(&send_db->list); + } + mlx4_spin_unlock(&ctx->send_db_lock); + + if (!send_db) { + /* Fill up more send_db objects */ + mlx4_spin_lock(&ctx->send_db_lock); + if ((ctx->send_db_num_uars + 1) * ctx->bf_regs_per_page >= ctx->max_ctx_res_domain) { + mlx4_spin_unlock(&ctx->send_db_lock); + errno = ENOMEM; + return NULL; + } + uar_idx = ctx->send_db_num_uars; + ctx->send_db_num_uars++; + 
+        mlx4_spin_unlock(&ctx->send_db_lock);
+
+        uar = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+                   ctx->ibv_ctx.cmd_fd,
+                   dev->page_size * (MLX4_IB_EXP_MMAP_EXT_UAR_PAGE |
+                                     (uar_idx << MLX4_MMAP_CMD_BITS)));
+        if (uar == MAP_FAILED)
+            return NULL;
+        bfs = mmap(NULL, dev->page_size, PROT_WRITE, MAP_SHARED,
+                   ctx->ibv_ctx.cmd_fd,
+                   dev->page_size * (MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE |
+                                     (uar_idx << MLX4_MMAP_CMD_BITS)));
+        if (bfs == MAP_FAILED) {
+            munmap(uar, dev->page_size);
+            return NULL;
+        }
+        mlx4_spin_lock(&ctx->send_db_lock);
+        for (i = 0; i < ctx->bf_regs_per_page; i++) {
+            send_db = calloc(1, sizeof(*send_db));
+            if (!send_db) {
+                if (i)
+                    break;
+                mlx4_spin_unlock(&ctx->send_db_lock);
+                errno = ENOMEM;
+                return NULL;
+            }
+
+            mlx4_lock_init(&send_db->bf.cmn.lock,
+                           !mlx4_single_threaded,
+                           mlx4_get_locktype());
+
+            send_db->db_addr = uar_to_send_db((uintptr_t)uar);
+
+            /* Allocate a pair of blue-flames to toggle sends between them */
+            send_db->bf.cmn.address = bfs + (i * ctx->bfs.buf_size * 2);
+            list_add(&send_db->list, &ctx->send_db_list);
+        }
+
+        /* Return the last send_db object to the caller */
+        list_del(&send_db->list);
+        mlx4_spin_unlock(&ctx->send_db_lock);
+    }
+
+    return send_db;
+}
+
+static void free_send_db(struct mlx4_context *ctx,
+                         struct mlx4_send_db_data *send_db)
+{
+    mlx4_spin_lock(&ctx->send_db_lock);
+    list_add(&send_db->list, &ctx->send_db_list);
+    mlx4_spin_unlock(&ctx->send_db_lock);
+}
+
+struct ibv_exp_res_domain *mlx4_exp_create_res_domain(struct ibv_context *context,
+                                                      struct ibv_exp_res_domain_init_attr *attr)
+{
+    struct mlx4_context *ctx = to_mctx(context);
+    struct mlx4_res_domain *res_domain;
+
+    if (attr->comp_mask >= IBV_EXP_RES_DOMAIN_RESERVED) {
+        errno = EINVAL;
+        return NULL;
+    }
+
+    res_domain = calloc(1, sizeof(*res_domain));
+    if (!res_domain) {
+        errno = ENOMEM;
+        return NULL;
+    }
+
+    res_domain->ibv_res_domain.context = context;
+
+    /* set default values */
+    res_domain->attr.thread_model = IBV_EXP_THREAD_SAFE;
+    res_domain->attr.msg_model = IBV_EXP_MSG_DEFAULT;
+    /* get requested valid values */
+    if (attr->comp_mask & IBV_EXP_RES_DOMAIN_THREAD_MODEL)
+        res_domain->attr.thread_model = attr->thread_model;
+    if (attr->comp_mask & IBV_EXP_RES_DOMAIN_MSG_MODEL)
+        res_domain->attr.msg_model = attr->msg_model;
+    res_domain->attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+                                 IBV_EXP_RES_DOMAIN_MSG_MODEL;
+    /*
+     * Allocate a BF for every resource domain, since BF improves
+     * both the BW and the latency of single messages.
+     */
+    res_domain->send_db = allocate_send_db(ctx);
+
+    /* define resource domain type */
+    if (!res_domain->send_db) {
+        if (res_domain->attr.msg_model == IBV_EXP_MSG_FORCE_LOW_LATENCY)
+            /*
+             * Fail if the user asked for the force low-latency
+             * message model but we can't allocate a
+             * dedicated BF.
+             */
+            goto err;
+        else
+            /*
+             * A dedicated BF is not allocated for this
+             * resource domain.
+             */
+            res_domain->type = MLX4_RES_DOMAIN_BF_NONE;
+    } else {
+        /*
+         * A dedicated BF was allocated; set the
+         * resource-domain type according to the
+         * thread model.
+         */
+        switch (res_domain->attr.thread_model) {
+        case IBV_EXP_THREAD_SAFE:
+            res_domain->type = MLX4_RES_DOMAIN_BF_SAFE;
+            break;
+        case IBV_EXP_THREAD_UNSAFE:
+            res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+            break;
+        case IBV_EXP_THREAD_SINGLE:
+            if (wc_auto_evict_size() == 64)
+                res_domain->type = MLX4_RES_DOMAIN_BF_SINGLE_WC_EVICT;
+            else
+                res_domain->type = MLX4_RES_DOMAIN_BF_UNSAFE;
+            break;
+        }
+    }
+
+    return &res_domain->ibv_res_domain;
+
+err:
+    free(res_domain);
+
+    return NULL;
+}
+
+int mlx4_exp_destroy_res_domain(struct ibv_context *context,
+                                struct ibv_exp_res_domain *res_dom,
+                                struct ibv_exp_destroy_res_domain_attr *attr)
+{
+    struct mlx4_res_domain *res_domain = to_mres_domain(res_dom);
+
+    if (res_domain->send_db)
+        free_send_db(to_mctx(context), res_domain->send_db);
+
+    free(res_domain);
+
+    return 0;
+}
+
+void *mlx4_exp_query_intf(struct ibv_context *context,
+                          struct ibv_exp_query_intf_params *params,
+                          enum ibv_exp_query_intf_status *status)
+{
+    void *family = NULL;
+    struct mlx4_qp *qp;
+    struct mlx4_cq *cq;
+
+    *status = IBV_EXP_INTF_STAT_OK;
+
+    if (!params->obj) {
+        errno = EINVAL;
+        *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+
+        return NULL;
+    }
+
+    if (params->intf_version > MLX4_MAX_FAMILY_VER) {
+        *status = IBV_EXP_INTF_STAT_VERSION_NOT_SUPPORTED;
+
+        return NULL;
+    }
+
+    switch (params->intf) {
+    case IBV_EXP_INTF_QP_BURST:
+        qp = to_mqp(params->obj);
+        if (qp->pattern == MLX4_QP_PATTERN) {
+            family = mlx4_get_qp_burst_family(qp, params, status);
+            if (*status != IBV_EXP_INTF_STAT_OK) {
+                fprintf(stderr, PFX "Failed to get QP burst family\n");
+                errno = EINVAL;
+            }
+        } else {
+            fprintf(stderr, PFX "Warning: non-valid QP passed to query interface\n");
+            *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+            errno = EINVAL;
+        }
+        break;
+
+    case IBV_EXP_INTF_CQ:
+        cq = to_mcq(params->obj);
+        if (cq->pattern == MLX4_CQ_PATTERN) {
+            family = (void *)mlx4_get_poll_cq_family(cq, params, status);
+        } else {
+            fprintf(stderr, PFX "Warning: non-valid CQ passed to query interface\n");
+            *status = IBV_EXP_INTF_STAT_INVAL_OBJ;
+            errno = EINVAL;
+        }
+        break;
+
+    default:
+        *status = IBV_EXP_INTF_STAT_INTF_NOT_SUPPORTED;
+        errno = EINVAL;
+    }
+
+    return family;
+}
+
+int mlx4_exp_release_intf(struct ibv_context *context, void *intf,
+                          struct ibv_exp_release_intf_params *params)
+{
+    return 0;
+}
Index: contrib/ofed/libmlx4/src/wqe.h
===================================================================
--- contrib/ofed/libmlx4/src/wqe.h
+++ contrib/ofed/libmlx4/src/wqe.h
@@ -38,9 +38,19 @@
 };
 
 enum {
-    MLX4_WQE_CTRL_FENCE        = 1 << 6,
-    MLX4_WQE_CTRL_CQ_UPDATE    = 3 << 2,
-    MLX4_WQE_CTRL_SOLICIT      = 1 << 1,
+    MLX4_WQE_CTRL_FENCE        = 1 << 6,
+    MLX4_WQE_CTRL_CQ_UPDATE    = 3 << 2,
+    MLX4_WQE_CTRL_SOLICIT      = 1 << 1,
+    MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7,
+    MLX4_WQE_CTRL_IIP          = 1 << 28,
+    MLX4_WQE_CTRL_IL4          = 1 << 27,
+    MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+    MLX4_WQE_CTRL_IP_CSUM      = 1 << 4,
+};
+
+enum {
+    MLX4_WQE_BIND_TYPE_2     = (1<<31),
+    MLX4_WQE_BIND_ZERO_BASED = (1<<30),
 };
 
 enum {
@@ -54,8 +64,7 @@
 
 struct mlx4_wqe_ctrl_seg {
     uint32_t        owner_opcode;
-    uint16_t        vlan_tag;
-    uint8_t         ins_vlan;
+    uint8_t         reserved[3];
     uint8_t         fence_size;
     /*
     * High 24 bits are SRC remote buffer; low 8 bits are flags:
@@ -66,7 +75,10 @@
     * [1] SE (solicited event)
     * [0] FL (force loopback)
     */
-    uint32_t        xrcrb_flags;
+    union {
+        uint32_t    srcrb_flags;
+        uint16_t    srcrb_flags16[2];
+    };
     /*
     * imm is immediate data for send/RDMA write w/ immediate;
     * also invalidation key for send with invalidate; input
@@ -99,6 +111,19 @@
     uint32_t        reserved2[3];
 };
 
+struct mlx4_wqe_local_inval_seg {
+    uint64_t        reserved1;
+    uint32_t        mem_key;
+    uint32_t        reserved2;
+    uint64_t        reserved3[2];
+};
+
+enum {
+    MLX4_WQE_MW_REMOTE_READ  = 1 << 29,
+    MLX4_WQE_MW_REMOTE_WRITE = 1 << 30,
+    MLX4_WQE_MW_ATOMIC       = 1 << 31
+};
+
 struct mlx4_wqe_raddr_seg {
     uint64_t        raddr;
     uint32_t        rkey;
@@ -110,6 +135,13 @@
     uint64_t        compare;
 };
 
+struct mlx4_wqe_masked_atomic_seg {
+    uint64_t        swap_data;
+    uint64_t        cmp_data;
+    uint64_t        swap_mask;
+    uint64_t        cmp_mask;
+};
+
 struct mlx4_wqe_bind_seg {
     uint32_t        flags1;
     uint32_t        flags2;
@@ -119,4 +151,11 @@
     uint64_t        length;
 };
 
+struct mlx4_wqe_wait_en_seg {
+    uint32_t        valid;
+    uint32_t        resv;
+    uint32_t        pi;
+    uint32_t        obj_num;
+};
+
 #endif /* WQE_H */
Index: contrib/ofed/usr.lib/libmlx4/Makefile
===================================================================
--- contrib/ofed/usr.lib/libmlx4/Makefile
+++ contrib/ofed/usr.lib/libmlx4/Makefile
@@ -14,7 +14,7 @@
 SHLIB_MAJOR= 1
 MK_PROFILE= no
 
-SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c
+SRCS= buf.c cq.c dbrec.c mlx4.c qp.c srq.c verbs.c verbs_exp.c
 
 LIBADD= ibverbs pthread
 CFLAGS+= -DHAVE_CONFIG_H
Index: contrib/ofed/usr.lib/libmlx4/config.h
===================================================================
--- contrib/ofed/usr.lib/libmlx4/config.h
+++ contrib/ofed/usr.lib/libmlx4/config.h
@@ -1,4 +1,3 @@
-#define HAVE_IBV_DONTFORK_RANGE
-#define HAVE_IBV_DOFORK_RANGE
-#define HAVE_IBV_REGISTER_DRIVER
-#define HAVE_IBV_READ_SYSFS_FILE
+#define HAVE_IBV_DOFORK_RANGE 1
+#define HAVE_IBV_DONTFORK_RANGE 1
+#define HAVE_IBV_REGISTER_DRIVER 1
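For reference, the experimental provider entry points added above (mlx4_exp_create_res_domain(), mlx4_exp_query_intf(), mlx4_exp_release_intf()) are not called directly by applications; they are reached through the matching ibv_exp_*() wrappers declared in <infiniband/verbs_exp.h>. The fragment below is an illustrative sketch only, not part of the patch: it assumes the MLNX_OFED experimental verbs API, the helper name get_burst_family() is ours, and in a real consumer the resource domain would be attached to the QP through the experimental QP-create attributes rather than created side by side with an existing QP as shown here.

/*
 * Sketch (assumption-laden, not part of the patch): acquire a resource
 * domain and the QP burst-family interface via the experimental verbs API.
 * 'qp' is assumed to have been created by this provider (see the
 * MLX4_QP_PATTERN check in mlx4_exp_query_intf() above).
 */
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs_exp.h>

static void *get_burst_family(struct ibv_context *ctx, struct ibv_qp *qp)
{
    struct ibv_exp_res_domain_init_attr rd_attr;
    struct ibv_exp_query_intf_params intf_params;
    enum ibv_exp_query_intf_status status;
    struct ibv_exp_res_domain *rd;
    void *burst_ops;

    memset(&rd_attr, 0, sizeof(rd_attr));
    rd_attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
                        IBV_EXP_RES_DOMAIN_MSG_MODEL;
    /* IBV_EXP_THREAD_SINGLE lets mlx4 pick the WC-evict BF type when the
     * write-combining auto-evict size is 64 bytes (see
     * mlx4_exp_create_res_domain() above). */
    rd_attr.thread_model = IBV_EXP_THREAD_SINGLE;
    /* IBV_EXP_MSG_DEFAULT: do not fail if no dedicated BF is available. */
    rd_attr.msg_model = IBV_EXP_MSG_DEFAULT;

    rd = ibv_exp_create_res_domain(ctx, &rd_attr);
    if (!rd)
        return NULL;    /* e.g. ENOMEM once all BF registers are in use */

    memset(&intf_params, 0, sizeof(intf_params));
    intf_params.intf = IBV_EXP_INTF_QP_BURST;
    intf_params.obj = qp;   /* intf_version left at 0, <= MLX4_MAX_FAMILY_VER */

    burst_ops = ibv_exp_query_intf(ctx, &intf_params, &status);
    if (status != IBV_EXP_INTF_STAT_OK)
        fprintf(stderr, "query_intf failed, status %d\n", status);

    /* The caller casts burst_ops to the burst-family ops table and later
     * releases it with ibv_exp_release_intf(); the resource domain is
     * freed with ibv_exp_destroy_res_domain() when no longer needed. */
    return burst_ops;
}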