Index: head/sys/compat/linuxkpi/common/include/linux/completion.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/completion.h	(revision 320071)
+++ head/sys/compat/linuxkpi/common/include/linux/completion.h	(revision 320072)
@@ -1,70 +1,69 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_COMPLETION_H_
 #define	_LINUX_COMPLETION_H_
 
 #include <linux/errno.h>
-#include <linux/wait.h>
 
 /*
  * Completion object handed to the linux_*() completion routines
  * declared later in this header.
  */
 struct completion {
 	/* Reset to 0 by INIT_COMPLETION(), init_completion(), reinit_completion(). */
 	unsigned int done;
 };
 
 #define	INIT_COMPLETION(c) \
 	((c).done = 0)
 #define	init_completion(c) \
 	do { (c)->done = 0; } while (0)
 #define	reinit_completion(c) \
 	do { (c)->done = 0; } while (0)
 #define	complete(c)				\
 	linux_complete_common((c), 0)
 #define	complete_all(c)				\
 	linux_complete_common((c), 1)
 #define	wait_for_completion(c)			\
 	linux_wait_for_common((c), 0)
 /*
  * Wait variant that passes 1 as the second argument of
  * linux_wait_for_common() (presumably "interruptible" -- confirm against
  * the implementation).  The historical misspelling "interuptible" is kept
  * for existing callers; the correctly spelled Linux name is provided as
  * well so that unmodified Linux code compiles.
  */
 #define	wait_for_completion_interuptible(c)	\
 	linux_wait_for_common((c), 1)
 #define	wait_for_completion_interruptible(c)	\
 	linux_wait_for_common((c), 1)
 #define	wait_for_completion_timeout(c, timeout)	\
 	linux_wait_for_timeout_common((c), (timeout), 0)
 #define	wait_for_completion_interruptible_timeout(c, timeout)	\
 	linux_wait_for_timeout_common((c), (timeout), 1)
 #define	try_wait_for_completion(c) \
 	linux_try_wait_for_completion(c)
 #define	completion_done(c) \
 	linux_completion_done(c)
 
 /*
  * Backends for the completion macros above; the implementations live
  * outside this header.
  */
 /* complete() passes 0 and complete_all() passes 1 as the second argument. */
 extern void linux_complete_common(struct completion *, int);
 /* wait_for_completion() passes 0; the interruptible variant passes 1. */
 extern long linux_wait_for_common(struct completion *, int);
 /* Timed waits: (completion, timeout, flag) with the same 0/1 flag as above. */
 extern long linux_wait_for_timeout_common(struct completion *, long, int);
 /* Backend of try_wait_for_completion(). */
 extern int linux_try_wait_for_completion(struct completion *);
 /* Backend of completion_done(). */
 extern int linux_completion_done(struct completion *);
 
 #endif					/* _LINUX_COMPLETION_H_ */
Index: head/sys/compat/linuxkpi/common/include/linux/kernel.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/kernel.h	(revision 320071)
+++ head/sys/compat/linuxkpi/common/include/linux/kernel.h	(revision 320072)
@@ -1,442 +1,441 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2016 Mellanox Technologies, Ltd.
  * Copyright (c) 2014-2015 François Tigeot
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_KERNEL_H_
 #define	_LINUX_KERNEL_H_
 
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/param.h>
 #include <sys/libkern.h>
 #include <sys/stat.h>
 #include <sys/smp.h>
 #include <sys/stddef.h>
 #include <sys/syslog.h>
 #include <sys/time.h>
 
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/jiffies.h>
-#include <linux/wait.h>
 #include <linux/log2.h> 
 #include <asm/byteorder.h>
 
 #include <machine/stdarg.h>
 
 #define KERN_CONT       ""
 #define	KERN_EMERG	"<0>"
 #define	KERN_ALERT	"<1>"
 #define	KERN_CRIT	"<2>"
 #define	KERN_ERR	"<3>"
 #define	KERN_WARNING	"<4>"
 #define	KERN_NOTICE	"<5>"
 #define	KERN_INFO	"<6>"
 #define	KERN_DEBUG	"<7>"
 
 #define	U8_MAX		((u8)~0U)
 #define	S8_MAX		((s8)(U8_MAX >> 1))
 #define	S8_MIN		((s8)(-S8_MAX - 1))
 #define	U16_MAX		((u16)~0U)
 #define	S16_MAX		((s16)(U16_MAX >> 1))
 #define	S16_MIN		((s16)(-S16_MAX - 1))
 #define	U32_MAX		((u32)~0U)
 #define	S32_MAX		((s32)(U32_MAX >> 1))
 #define	S32_MIN		((s32)(-S32_MAX - 1))
 #define	U64_MAX		((u64)~0ULL)
 #define	S64_MAX		((s64)(U64_MAX >> 1))
 #define	S64_MIN		((s64)(-S64_MAX - 1))
 
 #define	S8_C(x)  x
 #define	U8_C(x)  x ## U
 #define	S16_C(x) x
 #define	U16_C(x) x ## U
 #define	S32_C(x) x
 #define	U32_C(x) x ## U
 #define	S64_C(x) x ## LL
 #define	U64_C(x) x ## ULL
 
 #define	BUILD_BUG_ON(x)			CTASSERT(!(x))
 #define	BUILD_BUG_ON_MSG(x, msg)	BUILD_BUG_ON(x)
 #define	BUILD_BUG_ON_NOT_POWER_OF_2(x)	BUILD_BUG_ON(!powerof2(x))
 
 #define	BUG()			panic("BUG at %s:%d", __FILE__, __LINE__)
 #define	BUG_ON(cond)		do {				\
 	if (cond) {						\
 		panic("BUG ON %s failed at %s:%d",		\
 		    __stringify(cond), __FILE__, __LINE__);	\
 	}							\
 } while (0)
 
 #define	WARN_ON(cond) ({					\
       bool __ret = (cond);					\
       if (__ret) {						\
 		printf("WARNING %s failed at %s:%d\n",		\
 		    __stringify(cond), __FILE__, __LINE__);	\
       }								\
       unlikely(__ret);						\
 })
 
 #define	WARN_ON_SMP(cond)	WARN_ON(cond)
 
 #define	WARN_ON_ONCE(cond) ({					\
       static bool __warn_on_once;				\
       bool __ret = (cond);					\
       if (__ret && !__warn_on_once) {				\
 		__warn_on_once = 1;				\
 		printf("WARNING %s failed at %s:%d\n",		\
 		    __stringify(cond), __FILE__, __LINE__);	\
       }								\
       unlikely(__ret);						\
 })
 
 #define	oops_in_progress	SCHEDULER_STOPPED()
 
 #undef	ALIGN
 #define	ALIGN(x, y)		roundup2((x), (y))
 #undef PTR_ALIGN
 #define	PTR_ALIGN(p, a)		((__typeof(p))ALIGN((uintptr_t)(p), (a)))
 #define	DIV_ROUND_UP(x, n)	howmany(x, n)
 #define	DIV_ROUND_UP_ULL(x, n)	DIV_ROUND_UP((unsigned long long)(x), (n))
 #define	FIELD_SIZEOF(t, f)	sizeof(((t *)0)->f)
 
 #define	printk(...)		printf(__VA_ARGS__)
 #define	vprintk(f, a)		vprintf(f, a)
 
 struct va_format {
 	const char *fmt;
 	va_list *va;
 };
 
 /*
  * Format into "buf" like vsnprintf(), but return the number of
  * characters actually stored (not counting the terminating NUL) rather
  * than the length the untruncated output would have had.
  *
  * Returns 0 when "size" is 0, because nothing can be stored in that
  * case; otherwise the result never exceeds "size" - 1.
  */
 static inline int
 vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
 {
 	ssize_t ssize = size;
 	int i;
 
 	/* A zero-sized buffer holds nothing; report 0, never -1. */
 	if (size == 0)
 		return (0);
 
 	i = vsnprintf(buf, size, fmt, args);
 
 	/* On truncation only "size" - 1 characters were stored. */
 	return ((i >= ssize) ? (ssize - 1) : i);
 }
 
 /*
  * Variadic front end for vscnprintf(); same return value contract.
  */
 static inline int
 scnprintf(char *buf, size_t size, const char *fmt, ...)
 {
 	va_list args;
 	int i;
 
 	va_start(args, fmt);
 	i = vscnprintf(buf, size, fmt, args);
 	va_end(args);
 
 	return (i);
 }
 
 /*
  * The "pr_debug()" and "pr_devel()" macros should produce zero code
  * unless DEBUG is defined:
  */
 #ifdef DEBUG
 #define pr_debug(fmt, ...) \
         log(LOG_DEBUG, fmt, ##__VA_ARGS__)
 #define pr_devel(fmt, ...) \
 	log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__)
 #else
 #define pr_debug(fmt, ...) \
         ({ if (0) log(LOG_DEBUG, fmt, ##__VA_ARGS__); 0; })
 #define pr_devel(fmt, ...) \
 	({ if (0) log(LOG_DEBUG, pr_fmt(fmt), ##__VA_ARGS__); 0; })
 #endif
 
 #ifndef pr_fmt
 #define pr_fmt(fmt) fmt
 #endif
 
 /*
  * Print a one-time message (analogous to WARN_ONCE() et al):
  */
 #define printk_once(...) do {			\
 	static bool __print_once;		\
 						\
 	if (!__print_once) {			\
 		__print_once = true;		\
 		printk(__VA_ARGS__);		\
 	}					\
 } while (0)
 
 /*
  * Log a one-time message (analogous to WARN_ONCE() et al):
  */
 #define log_once(level,...) do {		\
 	static bool __log_once;			\
 						\
 	if (unlikely(!__log_once)) {		\
 		__log_once = true;		\
 		log(level, __VA_ARGS__);	\
 	}					\
 } while (0)
 
 #define pr_emerg(fmt, ...) \
 	log(LOG_EMERG, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_alert(fmt, ...) \
 	log(LOG_ALERT, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_crit(fmt, ...) \
 	log(LOG_CRIT, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_err(fmt, ...) \
 	log(LOG_ERR, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_warning(fmt, ...) \
 	log(LOG_WARNING, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_warn(...) \
 	pr_warning(__VA_ARGS__)
 #define pr_warn_once(fmt, ...) \
 	log_once(LOG_WARNING, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_notice(fmt, ...) \
 	log(LOG_NOTICE, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_info(fmt, ...) \
 	log(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_info_once(fmt, ...) \
 	log_once(LOG_INFO, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_cont(fmt, ...) \
 	printk(KERN_CONT fmt, ##__VA_ARGS__)
 #define	pr_warn_ratelimited(...) do {		\
 	static linux_ratelimit_t __ratelimited;	\
 	if (linux_ratelimited(&__ratelimited))	\
 		pr_warning(__VA_ARGS__);	\
 } while (0)
 
 #ifndef WARN
 #define	WARN(condition, ...) ({			\
         bool __ret_warn_on = (condition);	\
         if (unlikely(__ret_warn_on))		\
                 pr_warning(__VA_ARGS__);	\
         unlikely(__ret_warn_on);		\
 })
 #endif
 
 #ifndef WARN_ONCE
 #define	WARN_ONCE(condition, ...) ({		\
         bool __ret_warn_on = (condition);	\
         if (unlikely(__ret_warn_on))		\
                 pr_warn_once(__VA_ARGS__);	\
         unlikely(__ret_warn_on);		\
 })
 #endif
 
 #define container_of(ptr, type, member)				\
 ({								\
 	const __typeof(((type *)0)->member) *__p = (ptr);	\
 	(type *)((uintptr_t)__p - offsetof(type, member));	\
 })
   
 #define	ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
 
 /* Linux-named wrapper that forwards to strtouq(). */
 static inline unsigned long long
 simple_strtoull(const char *cp, char **endp, unsigned int base)
 {
 	unsigned long long value;
 
 	value = strtouq(cp, endp, base);
 	return (value);
 }
 
 /* Linux-named wrapper that forwards to strtoq(). */
 static inline long long
 simple_strtoll(const char *cp, char **endp, unsigned int base)
 {
 	long long value;
 
 	value = strtoq(cp, endp, base);
 	return (value);
 }
 
 /* Linux-named wrapper that forwards to strtoul(). */
 static inline unsigned long
 simple_strtoul(const char *cp, char **endp, unsigned int base)
 {
 	unsigned long value;
 
 	value = strtoul(cp, endp, base);
 	return (value);
 }
 
 /* Linux-named wrapper that forwards to strtol(). */
 static inline long
 simple_strtol(const char *cp, char **endp, unsigned int base)
 {
 	long value;
 
 	value = strtol(cp, endp, base);
 	return (value);
 }
 
 /*
  * Convert the whole string "cp" to an unsigned long in the given base.
  * The strtoul() result is always stored in "*res".  Returns 0 on
  * success, or -EINVAL when the string is empty or has trailing
  * characters.
  */
 static inline int
 kstrtoul(const char *cp, unsigned int base, unsigned long *res)
 {
 	char *tail;
 	int error = 0;
 
 	*res = strtoul(cp, &tail, base);
 	if (cp[0] == '\0' || tail[0] != '\0')
 		error = -EINVAL;
 
 	return (error);
 }
 
 /*
  * Convert the whole string "cp" to a long in the given base.  The
  * strtol() result is always stored in "*res".  Returns 0 on success, or
  * -EINVAL when the string is empty or has trailing characters.
  */
 static inline int
 kstrtol(const char *cp, unsigned int base, long *res)
 {
 	char *tail;
 	int error = 0;
 
 	*res = strtol(cp, &tail, base);
 	if (cp[0] == '\0' || tail[0] != '\0')
 		error = -EINVAL;
 
 	return (error);
 }
 
 /*
  * Convert the whole string "cp" to an int in the given base.  The
  * parsed value, converted to int, is always stored in "*res".  Returns 0
  * on success, -EINVAL when the string is empty or has trailing
  * characters, or -ERANGE when the parsed long does not fit in an int.
  */
 static inline int
 kstrtoint(const char *cp, unsigned int base, int *res)
 {
 	char *tail;
 	long parsed;
 
 	parsed = strtol(cp, &tail, base);
 	*res = parsed;
 
 	if (cp[0] == '\0' || tail[0] != '\0')
 		return (-EINVAL);
 	if ((long)(int)parsed != parsed)
 		return (-ERANGE);
 	return (0);
 }
 
 /*
  * Convert the whole string "cp" to an unsigned int in the given base.
  * The parsed value, converted to unsigned int, is always stored in
  * "*res".  Returns 0 on success, -EINVAL when the string is empty or has
  * trailing characters, or -ERANGE when the parsed unsigned long does not
  * fit in an unsigned int.
  */
 static inline int
 kstrtouint(const char *cp, unsigned int base, unsigned int *res)
 {
 	char *tail;
 	unsigned long parsed;
 
 	parsed = strtoul(cp, &tail, base);
 	*res = parsed;
 
 	if (cp[0] == '\0' || tail[0] != '\0')
 		return (-EINVAL);
 	if ((unsigned long)(unsigned int)parsed != parsed)
 		return (-ERANGE);
 	return (0);
 }
 
 /*
  * Convert the whole string "cp" to a u32 in the given base.  The parsed
  * value, converted to u32, is always stored in "*res".  Returns 0 on
  * success, -EINVAL when the string is empty or has trailing characters,
  * or -ERANGE when the parsed unsigned long does not fit in a u32.
  */
 static inline int
 kstrtou32(const char *cp, unsigned int base, u32 *res)
 {
 	char *tail;
 	unsigned long parsed;
 
 	parsed = strtoul(cp, &tail, base);
 	*res = parsed;
 
 	if (cp[0] == '\0' || tail[0] != '\0')
 		return (-EINVAL);
 	if ((unsigned long)(u32)parsed != parsed)
 		return (-ERANGE);
 	return (0);
 }
 
 #define min(x, y)	((x) < (y) ? (x) : (y))
 #define max(x, y)	((x) > (y) ? (x) : (y))
 
 #define min3(a, b, c)	min(a, min(b,c))
 #define max3(a, b, c)	max(a, max(b,c))
 
 #define	min_t(type, x, y) ({			\
 	type __min1 = (x);			\
 	type __min2 = (y);			\
 	__min1 < __min2 ? __min1 : __min2; })
 
 #define	max_t(type, x, y) ({			\
 	type __max1 = (x);			\
 	type __max2 = (y);			\
 	__max1 > __max2 ? __max1 : __max2; })
 
 #define clamp_t(type, _x, min, max)	min_t(type, max_t(type, _x, min), max)
 #define clamp(x, lo, hi)		min( max(x,lo), hi)
 #define	clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
 
 /*
  * This looks more complex than it should be. But we need to
  * get the type for the ~ right in round_down (it needs to be
  * as wide as the result!), and we want to evaluate the macro
  * arguments just once each.
  */
 #define __round_mask(x, y) ((__typeof__(x))((y)-1))
 #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
 #define	smp_processor_id()	PCPU_GET(cpuid)
 #define	num_possible_cpus()	mp_ncpus
 #define	num_online_cpus()	mp_ncpus
 
 #if defined(__i386__) || defined(__amd64__)
 extern bool linux_cpu_has_clflush;
 #define	cpu_has_clflush		linux_cpu_has_clflush
 #endif
 
 typedef struct pm_message {
         int event;
 } pm_message_t;
 
 /* Swap values of a and b */
 #define swap(a, b) do {			\
 	typeof(a) _swap_tmp = a;	\
 	a = b;				\
 	b = _swap_tmp;			\
 } while (0)
 
 #define	DIV_ROUND_CLOSEST(x, divisor)	(((x) + ((divisor) / 2)) / (divisor))
 
 #define	DIV_ROUND_CLOSEST_ULL(x, divisor) ({		\
 	__typeof(divisor) __d = (divisor);		\
 	unsigned long long __ret = (x) + (__d) / 2;	\
 	__ret /= __d;					\
 	__ret;						\
 })
 
 /*
  * Compute x * multiplier / divisor by scaling the quotient and the
  * remainder of x / divisor separately, so the full product
  * x * multiplier is never formed.
  */
 static inline uintmax_t
 mult_frac(uintmax_t x, uintmax_t multiplier, uintmax_t divisor)
 {
 	uintmax_t whole, rest;
 
 	whole = (x / divisor) * multiplier;
 	rest = ((x % divisor) * multiplier) / divisor;
 
 	return (whole + rest);
 }
 
 /* Return the magnitude of a signed 64-bit value. */
 static inline int64_t
 abs64(int64_t x)
 {
 	if (x < 0)
 		return (-x);
 	return (x);
 }
 
 /*
  * Rate-limit state.  Both fields are passed by address to
  * ppsratecheck() from linux_ratelimited().
  */
 typedef struct linux_ratelimit {
 	struct timeval lasttime;
 	int counter;
 } linux_ratelimit_t;
 
 /*
  * Ask ppsratecheck() about the state in "rl", using 1 as its third
  * argument, and return its verdict as a bool.
  */
 static inline bool
 linux_ratelimited(linux_ratelimit_t *rl)
 {
 	struct timeval *last = &rl->lasttime;
 	int *count = &rl->counter;
 
 	return (ppsratecheck(last, count, 1));
 }
 
 #endif	/* _LINUX_KERNEL_H_ */
Index: head/sys/compat/linuxkpi/common/include/linux/kobject.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/kobject.h	(revision 320071)
+++ head/sys/compat/linuxkpi/common/include/linux/kobject.h	(revision 320072)
@@ -1,148 +1,149 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_KOBJECT_H_
 #define	_LINUX_KOBJECT_H_
 
 #include <machine/stdarg.h>
 
 #include <linux/kernel.h>
 #include <linux/kref.h>
+#include <linux/list.h>
 #include <linux/slab.h>
 
 struct kobject;
 struct sysctl_oid;
 
 struct kobj_type {
 	void (*release)(struct kobject *kobj);
 	const struct sysfs_ops *sysfs_ops;
 	struct attribute **default_attrs;
 };
 
 extern const struct kobj_type linux_kfree_type;
 
 /*
  * Kernel object.  kobject_init() sets up "kref", "entry", "ktype" and
  * "oidp" (the latter to NULL); it does not touch "parent" or "name".
  * kobject_get() and kobject_put() operate on "kref", and kobject_name()
  * returns "name".
  */
 struct kobject {
 	struct kobject		*parent;
 	char			*name;
 	struct kref		kref;
 	const struct kobj_type	*ktype;
 	struct list_head	entry;
 	struct sysctl_oid	*oidp;
 };
 
 extern struct kobject *mm_kobj;
 
 struct attribute {
 	const char 	*name;
 	struct module	*owner;
 	mode_t		mode;
 };
 
 struct kobj_attribute {
         struct attribute attr;
         ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf);
         ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count);
 };
 
 /*
  * Prepare "kobj" for use: record its type, clear the sysctl OID
  * pointer, and initialize the list linkage and the reference count.
  */
 static inline void
 kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
 {
 
 	kobj->ktype = ktype;
 	kobj->oidp = NULL;
 	INIT_LIST_HEAD(&kobj->entry);
 	kref_init(&kobj->kref);
 }
 
 void linux_kobject_release(struct kref *kref);
 
 /*
  * Drop one reference on "kobj" through kref_put(), with
  * linux_kobject_release() as the release callback.  NULL is ignored.
  */
 static inline void
 kobject_put(struct kobject *kobj)
 {
 
 	if (!kobj)
 		return;
 	kref_put(&kobj->kref, linux_kobject_release);
 }
 
 /*
  * Take one reference on "kobj" unless it is NULL, and hand the same
  * pointer back to the caller.
  */
 static inline struct kobject *
 kobject_get(struct kobject *kobj)
 {
 
 	if (!kobj)
 		return (kobj);
 	kref_get(&kobj->kref);
 	return (kobj);
 }
 
 int	kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list);
 int	kobject_add(struct kobject *kobj, struct kobject *parent,
 	    const char *fmt, ...);
 
 /*
  * Allocate a zeroed kobject with kzalloc(GFP_KERNEL) and initialize it
  * with the linux_kfree_type kobj_type.  Returns NULL if the allocation
  * fails.
  */
 static inline struct kobject *
 kobject_create(void)
 {
 	struct kobject *kobj;
 
 	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
 	if (kobj != NULL)
 		kobject_init(kobj, &linux_kfree_type);
 
 	return (kobj);
 }
 
 /*
  * Create a kobject and add it under "parent" with the given name.
  * Returns the new object, or NULL if creation fails or kobject_add()
  * returns non-zero; in the latter case the reference is dropped first.
  */
 static inline struct kobject *
 kobject_create_and_add(const char *name, struct kobject *parent)
 {
 	struct kobject *kobj;
 
 	kobj = kobject_create();
 	if (kobj != NULL && kobject_add(kobj, parent, "%s", name) != 0) {
 		kobject_put(kobj);
 		kobj = NULL;
 	}
 
 	return (kobj);
 }
 
 /* Return the "name" pointer stored in "kobj". */
 static inline char *
 kobject_name(const struct kobject *kobj)
 {
 	char *name = kobj->name;
 
 	return (name);
 }
 
 int	kobject_set_name(struct kobject *kobj, const char *fmt, ...);
 int	kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
 	    struct kobject *parent, const char *fmt, ...);
 
 #endif /* _LINUX_KOBJECT_H_ */
Index: head/sys/compat/linuxkpi/common/include/linux/mm_types.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/mm_types.h	(revision 320071)
+++ head/sys/compat/linuxkpi/common/include/linux/mm_types.h	(revision 320072)
@@ -1,69 +1,68 @@
 /*-
  * Copyright (c) 2017 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _LINUX_MM_TYPES_H_
 #define	_LINUX_MM_TYPES_H_
 
 #include <linux/types.h>
-#include <linux/list.h>
 #include <linux/page.h>
 #include <linux/rwsem.h>
 
 #include <asm/atomic.h>
 
 struct vm_area_struct;
 struct task_struct;
 
 /*
  * Memory descriptor.  mmput() decrements "mm_users" and calls mmdrop()
  * when it reaches zero; mmdrop() decrements "mm_count" and calls
  * linux_mm_dtor() when that reaches zero.
  */
 struct mm_struct {
 	struct vm_area_struct *mmap;
 	atomic_t mm_count;
 	atomic_t mm_users;
 	size_t pinned_vm;
 	struct rw_semaphore mmap_sem;
 };
 
 extern void linux_mm_dtor(struct mm_struct *mm);
 
 /*
  * Decrement "mm_count"; when the decrement reaches zero, pass the
  * structure to linux_mm_dtor().
  */
 static inline void
 mmdrop(struct mm_struct *mm)
 {
 	if (!__predict_false(atomic_dec_and_test(&mm->mm_count)))
 		return;
 	linux_mm_dtor(mm);
 }
 
 /*
  * Decrement "mm_users"; when the decrement reaches zero, call mmdrop()
  * on the structure.
  */
 static inline void
 mmput(struct mm_struct *mm)
 {
 	if (!__predict_false(atomic_dec_and_test(&mm->mm_users)))
 		return;
 	mmdrop(mm);
 }
 
 extern struct mm_struct *linux_get_task_mm(struct task_struct *);
 #define	get_task_mm(task) linux_get_task_mm(task)
 
 #endif					/* _LINUX_MM_TYPES_H_ */
Index: head/sys/compat/linuxkpi/common/include/linux/sched.h
===================================================================
--- head/sys/compat/linuxkpi/common/include/linux/sched.h	(revision 320071)
+++ head/sys/compat/linuxkpi/common/include/linux/sched.h	(revision 320072)
@@ -1,151 +1,150 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2017 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_SCHED_H_
 #define	_LINUX_SCHED_H_
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 
-#include <linux/list.h>
 #include <linux/compat.h>
 #include <linux/completion.h>
 #include <linux/pid.h>
 #include <linux/slab.h>
 #include <linux/mm_types.h>
 #include <linux/string.h>
 #include <linux/bitmap.h>
 
 #include <asm/atomic.h>
 
 #define	MAX_SCHEDULE_TIMEOUT	INT_MAX
 
 #define	TASK_RUNNING		0x0000
 #define	TASK_INTERRUPTIBLE	0x0001
 #define	TASK_UNINTERRUPTIBLE	0x0002
 #define	TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
 #define	TASK_WAKING		0x0100
 
 /*
  * LinuxKPI task descriptor.  The "current" macro obtains it from the
  * td_lkpi_task field of curthread.  "usage" is the reference count
  * changed by get_task_struct() and put_task_struct(); "state" is written
  * by the set_task_state() family of macros and holds TASK_* values.
  */
 struct task_struct {
 	struct thread *task_thread;
 	struct mm_struct *mm;
 	linux_task_fn_t *task_fn;
 	void   *task_data;
 	int	task_ret;
 	atomic_t usage;
 	int	state;
 	atomic_t kthread_flags;
 	pid_t	pid;	/* BSD thread ID */
 	const char    *comm;
 	void   *bsd_ioctl_data;
 	unsigned bsd_ioctl_len;
 	struct completion parked;
 	struct completion exited;
 	TAILQ_ENTRY(task_struct) rcu_entry;
 	int rcu_recurse;
 };
 
 #define	current	({ \
 	struct thread *__td = curthread; \
 	linux_set_current(__td); \
 	((struct task_struct *)__td->td_lkpi_task); \
 })
 
 #define	task_pid_group_leader(task) (task)->task_thread->td_proc->p_pid
 #define	task_pid(task)		((task)->pid)
 #define	task_pid_nr(task)	((task)->pid)
 #define	get_pid(x)		(x)
 #define	put_pid(x)		do { } while (0)
 #define	current_euid()	(curthread->td_ucred->cr_uid)
 
 /*
  * Task state setters.  The "task" argument is parenthesized so that any
  * pointer-valued expression (for example "p + 1") may be passed, not
  * only a plain identifier.  set_task_state() stores through
  * atomic_store_rel_int(); the "__" variants are plain assignments.  The
  * "current" variants operate on the calling thread's task.
  */
 #define	set_task_state(task, x)		\
 	atomic_store_rel_int((volatile int *)&(task)->state, (x))
 #define	__set_task_state(task, x)	((task)->state = (x))
 #define	set_current_state(x)		set_task_state(current, x)
 #define	__set_current_state(x)		__set_task_state(current, x)
 
 static inline void
 get_task_struct(struct task_struct *task)
 {
 	atomic_inc(&task->usage);
 }
 
 /*
  * Drop one reference on "task"; when the usage counter reaches zero the
  * task is handed to linux_free_current().
  */
 static inline void
 put_task_struct(struct task_struct *task)
 {
 	if (!atomic_dec_and_test(&task->usage))
 		return;
 	linux_free_current(task);
 }
 
 /*
  * Call sched_relinquish() on the current thread unless "cold" is set.
  * Wrapped in do { } while (0) so the macro expands to exactly one
  * statement and cannot capture an "else" that follows it in the caller.
  */
 #define	cond_resched()	do {			\
 	if (!cold)				\
 		sched_relinquish(curthread);	\
 } while (0)
 
 #define	yield()		kern_yield(PRI_UNCHANGED)
 #define	sched_yield()	sched_relinquish(curthread)
 
 #define	need_resched() (curthread->td_flags & TDF_NEEDRESCHED)
 
 bool linux_signal_pending(struct task_struct *task);
 bool linux_fatal_signal_pending(struct task_struct *task);
 bool linux_signal_pending_state(long state, struct task_struct *task);
 void linux_send_sig(int signo, struct task_struct *task);
 
 #define	signal_pending(task)		linux_signal_pending(task)
 #define	fatal_signal_pending(task)	linux_fatal_signal_pending(task)
 #define	signal_pending_state(state, task)		\
 	linux_signal_pending_state(state, task)
 #define	send_sig(signo, task, priv) do {		\
 	CTASSERT(priv == 0);				\
 	linux_send_sig(signo, task);			\
 } while (0)
 
 int linux_schedule_timeout(int timeout);
 
 #define	schedule()					\
 	(void)linux_schedule_timeout(MAX_SCHEDULE_TIMEOUT)
 #define	schedule_timeout(timeout)			\
 	linux_schedule_timeout(timeout)
 #define	schedule_timeout_killable(timeout)		\
 	schedule_timeout_uninterruptible(timeout)
 #define	schedule_timeout_interruptible(timeout) ({	\
 	set_current_state(TASK_INTERRUPTIBLE);		\
 	schedule_timeout(timeout);			\
 })
 #define	schedule_timeout_uninterruptible(timeout) ({	\
 	set_current_state(TASK_UNINTERRUPTIBLE);	\
 	schedule_timeout(timeout);			\
 })
 
 #define	io_schedule()			schedule()
 #define	io_schedule_timeout(timeout)	schedule_timeout(timeout)
 
 #endif	/* _LINUX_SCHED_H_ */
Index: head/sys/contrib/rdma/krping/krping.c
===================================================================
--- head/sys/contrib/rdma/krping/krping.c	(revision 320071)
+++ head/sys/contrib/rdma/krping/krping.c	(revision 320072)
@@ -1,3347 +1,3348 @@
 /*
  * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
  * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/in.h>
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 
 #include <asm/atomic.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 
 #include "krping.h"
 #include "getopt.h"
 
 extern int krping_debug;
 /*
  * Log at LOG_INFO only when krping_debug is non-zero.  The cb argument
  * is not used.  The do { } while (0) wrapper makes the macro a single
  * statement so it cannot pair with an "else" in the calling code.
  */
 #define DEBUG_LOG(cb, x...) do {				\
 	if (krping_debug)					\
 		log(LOG_INFO, x);				\
 } while (0)
 /* Unconditionally log at LOG_INFO; the cb argument is not used. */
 #define PRINTF(cb, x...) log(LOG_INFO, x)
 /* Selects the bind_info member layout of struct ib_mw_bind (#ifdef below). */
 #define BIND_INFO 1
 
 /* Module metadata, version, and the dependency on the linuxkpi module. */
 MODULE_AUTHOR("Steve Wise");
 MODULE_DESCRIPTION("RDMA ping client/server");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(krping, 1);
 MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
 
 /*
  * Read the CPU time-stamp counter with the x86 "rdtsc" instruction.
  * The instruction returns the low 32 bits in EAX ("=a") and the high
  * 32 bits in EDX ("=d"); they are combined into one 64-bit value.
  * NOTE(review): x86-only as written.
  */
 static __inline uint64_t
 get_cycles(void)
 {
 	uint32_t low, high;
 	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
 	return (low | ((u_int64_t)high << 32));
 }
 
 /* Unit returned by get_cycles(). */
 typedef uint64_t cycles_t;
 
 /*
  * Memory registration mode, stored in krping_cb.mem.  It decides which
  * lkey/rkey the work requests use and how buffers are registered.
  */
 enum mem_type {
 	DMA = 1,	/* one MR from ib_get_dma_mr() (cb->dma_mr) */
 	FASTREG = 2,	/* rkey re-registered via fast-reg work requests */
 	MW = 3,		/* memory window bound over a registered MR */
 	MR = 4		/* per-buffer MRs from ib_reg_phys_mr() */
 };
 
 /*
  * Option table for the krping command string.  Each entry is
  * { name, argument kind (OPT_INT / OPT_STRING / OPT_NOPARAM), code };
  * the table ends with a { NULL, 0, 0 } sentinel.
  * NOTE(review): field meanings are inferred from the initializers; see
  * struct krping_option in getopt.h to confirm.
  */
 static const struct krping_option krping_opts[] = {
 	{"count", OPT_INT, 'C'},
 	{"size", OPT_INT, 'S'},
 	{"addr", OPT_STRING, 'a'},
 	{"port", OPT_INT, 'p'},
 	{"verbose", OPT_NOPARAM, 'v'},
 	{"validate", OPT_NOPARAM, 'V'},
 	{"server", OPT_NOPARAM, 's'},
 	{"client", OPT_NOPARAM, 'c'},
 	{"mem_mode", OPT_STRING, 'm'},
 	{"server_inv", OPT_NOPARAM, 'I'},
  	{"wlat", OPT_NOPARAM, 'l'},
  	{"rlat", OPT_NOPARAM, 'L'},
  	{"bw", OPT_NOPARAM, 'B'},
  	{"duplex", OPT_NOPARAM, 'd'},
  	{"txdepth", OPT_INT, 'T'},
  	{"poll", OPT_NOPARAM, 'P'},
  	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
  	{"read_inv", OPT_NOPARAM, 'R'},
  	{"fr", OPT_INT, 'f'},
 	{NULL, 0, 0}
 };
 
 /*
  * 64-bit host <-> network (big-endian) conversions.  ntohll() uses the
  * big-endian-to-CPU primitive so that each macro names the direction it
  * actually converts.
  */
 #define htonll(x) cpu_to_be64((x))
 #define ntohll(x) be64_to_cpu((x))
 
 /* NOTE(review): presumably serializes access to krping_cbs -- confirm. */
 static struct mutex krping_mutex;
 
 /*
  * List of running krping threads.
  */
 static LIST_HEAD(krping_cbs);
 
 /*
  * krping "ping/pong" loop:
  * 	client sends source rkey/addr/len
  *	server receives source rkey/addr/len
  *	server rdma reads "ping" data from source
  * 	server sends "go ahead" on rdma read completion
  *	client sends sink rkey/addr/len
  * 	server receives sink rkey/addr/len
  * 	server rdma writes "pong" data to sink
  * 	server sends "go ahead" on rdma write completion
  * 	<repeat loop>
  */
 
 /*
  * These states are used to signal events between the completion handler
  * and the main client or server thread.
  *
  * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
  * and RDMA_WRITE_COMPLETE for each ping.
  */
 /*
  * Values are ordered: waiters sleep on "cb->state >= X" and then check
  * for the exact state they expected.  ERROR is the largest value, so
  * setting it satisfies every such wait.
  */
 enum test_state {
 	IDLE = 1,
 	CONNECT_REQUEST,
 	ADDR_RESOLVED,
 	ROUTE_RESOLVED,
 	CONNECTED,
 	RDMA_READ_ADV,
 	RDMA_READ_COMPLETE,
 	RDMA_WRITE_ADV,
 	RDMA_WRITE_COMPLETE,
 	ERROR
 };
 
 /*
  * Buffer advertisement exchanged in send/recv messages.  Fields are
  * written with htonll()/htonl() and read back with ntohll()/ntohl().
  */
 struct krping_rdma_info {
 	uint64_t buf;	/* buffer address */
 	uint32_t rkey;	/* remote key for the buffer */
 	uint32_t size;	/* buffer length */
 };
 
 /*
  * Default max buffer size for IO...
  * Parenthesized so the value stays intact inside larger expressions
  * (e.g. "x / RPING_BUFSIZE").
  */
 #define RPING_BUFSIZE (128*1024)
 #define RPING_SQ_DEPTH 64
 
 /*
  * Control block struct.
  *
  * One instance describes one krping client or server run: the verbs
  * objects (PD/CQ/QP), the registered buffers and their work requests,
  * the state used to signal between the event handlers and the test
  * thread, and the parsed test options.
  */
 struct krping_cb {
 	void *cookie;
 	int server;			/* 0 iff client */
 	struct ib_cq *cq;
 	struct ib_pd *pd;
 	struct ib_qp *qp;
 
 	enum mem_type mem;
 	struct ib_mr *dma_mr;
 
 	struct ib_fast_reg_page_list *page_list;
 	int page_list_len;
 	struct ib_send_wr fastreg_wr;
 	struct ib_send_wr invalidate_wr;
 	struct ib_mr *fastreg_mr;
 	int server_invalidate;
 	int read_inv;
 	u8 key;
 
 	struct ib_mw *mw;
 	struct ib_mw_bind bind_attr;
 
 	struct ib_recv_wr rq_wr;	/* recv work request record */
 	struct ib_sge recv_sgl;		/* recv single SGE */
 	struct krping_rdma_info recv_buf;/* malloc'd buffer */
 	u64 recv_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
 	struct ib_mr *recv_mr;
 
 	struct ib_send_wr sq_wr;	/* send work request record */
 	struct ib_sge send_sgl;
 	struct krping_rdma_info send_buf;/* single send buf */
 	u64 send_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(send_mapping)
 	struct ib_mr *send_mr;
 
 	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
 	struct ib_sge rdma_sgl;		/* rdma single SGE */
 	char *rdma_buf;			/* used as rdma sink */
 	u64  rdma_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
 	struct ib_mr *rdma_mr;
 
 	uint32_t remote_rkey;		/* remote guys RKEY */
 	uint64_t remote_addr;		/* remote guys TO */
 	uint32_t remote_len;		/* remote guys LEN */
 
 	char *start_buf;		/* rdma read src */
 	u64  start_dma_addr;
 	DECLARE_PCI_UNMAP_ADDR(start_mapping)
 	struct ib_mr *start_mr;
 
 	enum test_state state;		/* used for cond/signalling */
 	wait_queue_head_t sem;
 	struct krping_stats stats;
 
 	uint16_t port;			/* dst port in NBO */
 	struct in_addr addr;		/* dst addr in NBO */
 	char *addr_str;			/* dst addr string */
 	int verbose;			/* verbose logging */
 	int count;			/* ping count */
 	int size;			/* ping data size */
 	int validate;			/* validate ping data */
 	int wlat;			/* run wlat test */
 	int rlat;			/* run rlat test */
 	int bw;				/* run bw test */
 	int duplex;			/* run bw full duplex test */
 	int poll;			/* poll or block for rlat test */
 	int txdepth;			/* SQ depth */
 	int local_dma_lkey;		/* use 0 for lkey */
 	int frtest;			/* fastreg test */
 	int testnum;
 
 	/* CM stuff */
 	struct rdma_cm_id *cm_id;	/* connection on client side,*/
 					/* listener on server side. */
 	struct rdma_cm_id *child_cm_id;	/* connection on server side */
 	struct list_head list;
 };
 
 /*
  * RDMA CM event callback.
  *
  * Translates connection-manager events into cb->state transitions and
  * wakes the thread sleeping on cb->sem.  cb is taken from
  * cma_id->context.  Always returns 0.
  */
 static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
 				   struct rdma_cm_event *event)
 {
 	int ret;
 	struct krping_cb *cb = cma_id->context;
 
 	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
 	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
 
 	switch (event->event) {
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
 		cb->state = ADDR_RESOLVED;
 		ret = rdma_resolve_route(cma_id, 2000);
 		if (ret) {
 			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
 			/*
 			 * Route resolution never started, so no further
 			 * event will advance the state.  Record the failure
 			 * before waking the waiter; otherwise its
 			 * "state >= ..." predicate stays false and it goes
 			 * straight back to sleep.
 			 */
 			cb->state = ERROR;
 			wake_up_interruptible(&cb->sem);
 		}
 		break;
 
 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
 		cb->state = ROUTE_RESOLVED;
 		cb->child_cm_id = cma_id;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_CONNECT_REQUEST:
 		if (cb->state == IDLE) {
 			cb->state = CONNECT_REQUEST;
 			cb->child_cm_id = cma_id;
 		} else {
 			PRINTF(cb, "Received connection request in wrong state"
 			    " (%d)\n", cb->state);
 		}
 		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
 		DEBUG_LOG(cb, "ESTABLISHED\n");
 		/* Only the client moves to CONNECTED here. */
 		if (!cb->server) {
 			cb->state = CONNECTED;
 		}
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_ADDR_ERROR:
 	case RDMA_CM_EVENT_ROUTE_ERROR:
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 	case RDMA_CM_EVENT_REJECTED:
 		PRINTF(cb, "cma event %d, error %d\n", event->event,
 		       event->status);
 		cb->state = ERROR;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_DISCONNECTED:
 		PRINTF(cb, "DISCONNECT EVENT...\n");
 		cb->state = ERROR;
 		wake_up_interruptible(&cb->sem);
 		break;
 
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 		/*
 		 * NOTE(review): only logged; the state is left unchanged
 		 * and nobody is woken.  Confirm this is intended.
 		 */
 		PRINTF(cb, "cma detected device removal!!!!\n");
 		break;
 
 	default:
 		PRINTF(cb, "oof bad type!\n");
 		wake_up_interruptible(&cb->sem);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Handle a receive completion carrying a buffer advertisement.
  *
  * Rejects messages whose length differs from sizeof(cb->recv_buf),
  * otherwise stores the advertised rkey/address/length in host order and
  * advances the state: to RDMA_READ_ADV when starting a new cycle (state
  * at or below CONNECTED, or RDMA_WRITE_COMPLETE), else to RDMA_WRITE_ADV.
  * Returns 0 on success, -1 on a bad message size.
  */
 static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
 {
 	int new_cycle;
 
 	if (wc->byte_len != sizeof(cb->recv_buf)) {
 		PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len);
 		return -1;
 	}
 
 	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
 	cb->remote_addr = ntohll(cb->recv_buf.buf);
 	cb->remote_len = ntohl(cb->recv_buf.size);
 	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
 	    cb->remote_rkey, (unsigned long long)cb->remote_addr,
 	    cb->remote_len);
 
 	new_cycle = (cb->state <= CONNECTED ||
 	    cb->state == RDMA_WRITE_COMPLETE);
 	cb->state = new_cycle ? RDMA_READ_ADV : RDMA_WRITE_ADV;
 
 	return 0;
 }
 
 /*
  * Handle a receive completion on the client: the message carries no
  * payload of interest, only its arrival matters.  Moves the state from
  * RDMA_READ_ADV to RDMA_WRITE_ADV, or from anything else to
  * RDMA_WRITE_COMPLETE.  Returns 0, or -1 when the message length is not
  * sizeof(cb->recv_buf).
  */
 static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
 {
 	if (wc->byte_len != sizeof(cb->recv_buf)) {
 		PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len);
 		return -1;
 	}
 
 	cb->state = (cb->state == RDMA_READ_ADV) ?
 	    RDMA_WRITE_ADV : RDMA_WRITE_COMPLETE;
 
 	return 0;
 }
 
 /*
  * Completion-queue callback.
  *
  * Drains the CQ one completion at a time, updates the statistics and
  * cb->state according to the completed opcode, and wakes the thread
  * sleeping on cb->sem.  Any failure sets cb->state to ERROR and wakes
  * the waiter.  ctx is the krping_cb that owns the CQ.
  */
 static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
 {
 	struct krping_cb *cb = ctx;
 	struct ib_wc wc;
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
 	BUG_ON(cb->cq != cq);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "cq completion in ERROR state\n");
 		return;
 	}
 	/* Re-arm notification, except in the wlat/rlat/bw/fastreg tests. */
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
 		if (wc.status) {
 			/* Flushed work requests are skipped, not fatal. */
 			if (wc.status == IB_WC_WR_FLUSH_ERR) {
 				DEBUG_LOG(cb, "cq flushed\n");
 				continue;
 			} else {
 				PRINTF(cb, "cq completion failed with "
 				       "wr_id %jx status %d opcode %d vender_err %x\n",
 					(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
 				goto error;
 			}
 		}
 
 		switch (wc.opcode) {
 		case IB_WC_SEND:
 			/* Send completions only update statistics. */
 			DEBUG_LOG(cb, "send completion\n");
 			cb->stats.send_bytes += cb->send_sgl.length;
 			cb->stats.send_msgs++;
 			break;
 
 		case IB_WC_RDMA_WRITE:
 			DEBUG_LOG(cb, "rdma write completion\n");
 			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
 			cb->stats.write_msgs++;
 			cb->state = RDMA_WRITE_COMPLETE;
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		case IB_WC_RDMA_READ:
 			DEBUG_LOG(cb, "rdma read completion\n");
 			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
 			cb->stats.read_msgs++;
 			cb->state = RDMA_READ_COMPLETE;
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		case IB_WC_RECV:
 			DEBUG_LOG(cb, "recv completion\n");
 			cb->stats.recv_bytes += sizeof(cb->recv_buf);
 			cb->stats.recv_msgs++;
 			/*
 			 * In the wlat/rlat/bw/fastreg tests both sides parse
 			 * the advertisement with server_recv().
 			 */
 			if (cb->wlat || cb->rlat || cb->bw || cb->frtest)
 				ret = server_recv(cb, &wc);
 			else
 				ret = cb->server ? server_recv(cb, &wc) :
 						   client_recv(cb, &wc);
 			if (ret) {
 				PRINTF(cb, "recv wc error: %d\n", ret);
 				goto error;
 			}
 
 			/* Repost the receive WR for the next message. */
 			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
 			if (ret) {
 				PRINTF(cb, "post recv error: %d\n", 
 				       ret);
 				goto error;
 			}
 			wake_up_interruptible(&cb->sem);
 			break;
 
 		default:
 			PRINTF(cb, 
 			       "%s:%d Unexpected opcode %d, Shutting down\n",
 			       __func__, __LINE__, wc.opcode);
 			goto error;
 		}
 	}
 	/* The loop ends on 0 (CQ empty) or on a non-zero poll error. */
 	if (ret) {
 		PRINTF(cb, "poll error %d\n", ret);
 		goto error;
 	}
 	return;
 error:
 	cb->state = ERROR;
 	wake_up_interruptible(&cb->sem);
 }
 
 static int krping_accept(struct krping_cb *cb)
 {
 	struct rdma_conn_param conn_param;
 	int ret;
 
 	DEBUG_LOG(cb, "accepting client connection request\n");
 
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 
 	ret = rdma_accept(cb->child_cm_id, &conn_param);
 	if (ret) {
 		PRINTF(cb, "rdma_accept error: %d\n", ret);
 		return ret;
 	}
 
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
 		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
 		if (cb->state == ERROR) {
 			PRINTF(cb, "wait for CONNECTED state %d\n", 
 				cb->state);
 			return -1;
 		}
 	}
 	return 0;
 }
 
 static void krping_setup_wr(struct krping_cb *cb)
 {
 	cb->recv_sgl.addr = cb->recv_dma_addr;
 	cb->recv_sgl.length = sizeof cb->recv_buf;
 	if (cb->local_dma_lkey)
 		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
 	else if (cb->mem == DMA)
 		cb->recv_sgl.lkey = cb->dma_mr->lkey;
 	else
 		cb->recv_sgl.lkey = cb->recv_mr->lkey;
 	cb->rq_wr.sg_list = &cb->recv_sgl;
 	cb->rq_wr.num_sge = 1;
 
 	cb->send_sgl.addr = cb->send_dma_addr;
 	cb->send_sgl.length = sizeof cb->send_buf;
 	if (cb->local_dma_lkey)
 		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
 	else if (cb->mem == DMA)
 		cb->send_sgl.lkey = cb->dma_mr->lkey;
 	else
 		cb->send_sgl.lkey = cb->send_mr->lkey;
 
 	cb->sq_wr.opcode = IB_WR_SEND;
 	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
 	cb->sq_wr.sg_list = &cb->send_sgl;
 	cb->sq_wr.num_sge = 1;
 
 	if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 		cb->rdma_sgl.addr = cb->rdma_dma_addr;
 		if (cb->mem == MR)
 			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
 		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
 		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
 		cb->rdma_sq_wr.num_sge = 1;
 	}
 
 	switch(cb->mem) {
 	case FASTREG:
 
 		/* 
 		 * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
 		 * both unsignaled.  The client uses them to reregister
 		 * the rdma buffers with a new key each iteration.
 		 */
 		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
 		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 		cb->fastreg_wr.wr.fast_reg.length = cb->size;
 		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
 		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
 
 		cb->invalidate_wr.next = &cb->fastreg_wr;
 		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
 		break;
 	case MW:
 		cb->bind_attr.wr_id = 0xabbaabba;
 		cb->bind_attr.send_flags = 0; /* unsignaled */
 #ifdef BIND_INFO
 		cb->bind_attr.bind_info.length = cb->size;
 #else
 		cb->bind_attr.length = cb->size;
 #endif
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Allocate, DMA-map and register every buffer the test needs, then
  * build the reusable work requests with krping_setup_wr().
  *
  * What is registered depends on cb->mem (DMA, FASTREG, MW or MR) and on
  * whether this side needs a "start" buffer (client, or any of the
  * wlat/rlat/bw/fastreg tests).
  *
  * Returns 0 on success or a negative errno.  On failure everything
  * acquired so far is released, including the DMA mappings.  The bail
  * path relies on the unused cb fields being NULL.
  */
 static int krping_setup_buffers(struct krping_cb *cb)
 {
 	int ret;
 	struct ib_phys_buf buf;
 	u64 iovbase;
 
 	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
 
 	/* The recv/send message buffers live inside cb and are always mapped. */
 	cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, 
 				   &cb->recv_buf, 
 				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
 	cb->send_dma_addr = ib_dma_map_single(cb->pd->device, 
 					   &cb->send_buf, sizeof(cb->send_buf),
 					   DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
 
 	if (cb->mem == DMA) {
 		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
 					   IB_ACCESS_REMOTE_READ|
 				           IB_ACCESS_REMOTE_WRITE);
 		if (IS_ERR(cb->dma_mr)) {
 			DEBUG_LOG(cb, "reg_dmamr failed\n");
 			ret = PTR_ERR(cb->dma_mr);
 			goto bail;
 		}
 	} else {
 		/* With local_dma_lkey no MRs are needed for recv/send. */
 		if (!cb->local_dma_lkey) {
 			buf.addr = cb->recv_dma_addr;
 			buf.size = sizeof cb->recv_buf;
 			DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n",
 			    (uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->recv_dma_addr;
 			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						     IB_ACCESS_LOCAL_WRITE, 
 						     &iovbase);
 
 			if (IS_ERR(cb->recv_mr)) {
 				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->recv_mr);
 				goto bail;
 			}
 
 			buf.addr = cb->send_dma_addr;
 			buf.size = sizeof cb->send_buf;
 			DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n",
 			    (uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->send_dma_addr;
 			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						     0, &iovbase);
 
 			if (IS_ERR(cb->send_mr)) {
 				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->send_mr);
 				goto bail;
 			}
 		}
 	}
 
 	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
 	if (!cb->rdma_buf) {
 		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
 		ret = -ENOMEM;
 		goto bail;
 	}
 
 	/* From here on a non-NULL rdma_buf implies rdma_mapping is live. */
 	cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, 
 			       cb->rdma_buf, cb->size, 
 			       DMA_BIDIRECTIONAL);
 	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
 	if (cb->mem != DMA) {
 		switch (cb->mem) {
 		case FASTREG:
 			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
 				PAGE_SIZE) >> PAGE_SHIFT;
 			cb->page_list = ib_alloc_fast_reg_page_list(
 						cb->pd->device, 
 						cb->page_list_len);
 			if (IS_ERR(cb->page_list)) {
 				DEBUG_LOG(cb, "fast_reg page_list alloc failed\n");
 				ret = PTR_ERR(cb->page_list);
 				goto bail;
 			}
 			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, 
 					cb->page_list->max_page_list_len);
 			if (IS_ERR(cb->fastreg_mr)) {
 				DEBUG_LOG(cb, "fast_reg mr alloc failed\n");
 				ret = PTR_ERR(cb->fastreg_mr);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
 				" page_list_len %u\n", cb->fastreg_mr->rkey, 
 				cb->page_list, cb->page_list_len);
 			break;
 		case MW:
 			cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
 			if (IS_ERR(cb->mw)) {
 				DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
 				ret = PTR_ERR(cb->mw);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
 			/* A memory window also needs the MR registered below. */
 			/*FALLTHROUGH*/
 		case MR:
 			buf.addr = cb->rdma_dma_addr;
 			buf.size = cb->size;
 			iovbase = cb->rdma_dma_addr;
 			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 						IB_ACCESS_LOCAL_WRITE|
 					     IB_ACCESS_REMOTE_READ| 
 					     IB_ACCESS_REMOTE_WRITE, 
 					     &iovbase);
 			if (IS_ERR(cb->rdma_mr)) {
 				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->rdma_mr);
 				goto bail;
 			}
 			DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n",
 				(uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey);
 			break;
 		default:
 			ret = -EINVAL;
 			goto bail;
 			break;
 		}
 	}
 
 	if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 
 		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
 		if (!cb->start_buf) {
 			DEBUG_LOG(cb, "start_buf malloc failed\n");
 			ret = -ENOMEM;
 			goto bail;
 		}
 
 		/* A non-NULL start_buf implies start_mapping is live. */
 		cb->start_dma_addr = ib_dma_map_single(cb->pd->device, 
 						   cb->start_buf, cb->size, 
 						   DMA_BIDIRECTIONAL);
 		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
 
 		if (cb->mem == MR || cb->mem == MW) {
 			unsigned flags = IB_ACCESS_REMOTE_READ;
 
 			if (cb->wlat || cb->rlat || cb->bw || cb->frtest) {
 				flags |= IB_ACCESS_LOCAL_WRITE |
 					IB_ACCESS_REMOTE_WRITE;
 			}
 
 			buf.addr = cb->start_dma_addr;
 			buf.size = cb->size;
 			DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n",
 				(uintmax_t)buf.addr, (int)buf.size);
 			iovbase = cb->start_dma_addr;
 			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 
 					     flags,
 					     &iovbase);
 
 			if (IS_ERR(cb->start_mr)) {
 				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
 				ret = PTR_ERR(cb->start_mr);
 				goto bail;
 			}
 		}
 	}
 
 	krping_setup_wr(cb);
 	DEBUG_LOG(cb, "allocated & registered buffers...\n");
 	return 0;
 bail:
 	/* Fields may hold NULL (never set) or an ERR_PTR (the failed call). */
 	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
 		ib_dereg_mr(cb->fastreg_mr);
 	if (cb->mw && !IS_ERR(cb->mw))
 		ib_dealloc_mw(cb->mw);
 	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
 		ib_dereg_mr(cb->rdma_mr);
 	if (cb->page_list && !IS_ERR(cb->page_list))
 		ib_free_fast_reg_page_list(cb->page_list);
 	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
 		ib_dereg_mr(cb->dma_mr);
 	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
 		ib_dereg_mr(cb->recv_mr);
 	if (cb->send_mr && !IS_ERR(cb->send_mr))
 		ib_dereg_mr(cb->send_mr);
 
 	/*
 	 * Undo the DMA mappings the same way krping_free_buffers() does.
 	 * recv/send are mapped before the first "goto bail"; rdma/start are
 	 * mapped right after their buffer allocation succeeds.
 	 */
 	dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, recv_mapping),
 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
 	dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, send_mapping),
 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
 	if (cb->rdma_buf) {
 		dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, rdma_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 		kfree(cb->rdma_buf);
 	}
 	if (cb->start_buf) {
 		dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, start_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 		kfree(cb->start_buf);
 	}
 	return ret;
 }
 
 /*
  * Release everything krping_setup_buffers() set up: memory regions,
  * the memory window, the fast-reg page list, the DMA mappings, and the
  * rdma/start buffers.  Objects that were never created are expected to
  * be NULL in cb.
  */
 static void krping_free_buffers(struct krping_cb *cb)
 {
 	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
 	
 	if (cb->dma_mr)
 		ib_dereg_mr(cb->dma_mr);
 	if (cb->send_mr)
 		ib_dereg_mr(cb->send_mr);
 	if (cb->recv_mr)
 		ib_dereg_mr(cb->recv_mr);
 	if (cb->rdma_mr)
 		ib_dereg_mr(cb->rdma_mr);
 	if (cb->start_mr)
 		ib_dereg_mr(cb->start_mr);
 	if (cb->fastreg_mr)
 		ib_dereg_mr(cb->fastreg_mr);
 	/* Allocated in FASTREG mode only; previously leaked here. */
 	if (cb->page_list)
 		ib_free_fast_reg_page_list(cb->page_list);
 	if (cb->mw)
 		ib_dealloc_mw(cb->mw);
 
 	dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, recv_mapping),
 			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
 	dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, send_mapping),
 			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
 	dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, rdma_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 	kfree(cb->rdma_buf);
 	/* The start buffer exists only on the client / perf-test paths. */
 	if (cb->start_buf) {
 		dma_unmap_single(cb->pd->device->dma_device,
 			 pci_unmap_addr(cb, start_mapping),
 			 cb->size, DMA_BIDIRECTIONAL);
 		kfree(cb->start_buf);
 	}
 }
 
 static int krping_create_qp(struct krping_cb *cb)
 {
 	struct ib_qp_init_attr init_attr;
 	int ret;
 
 	memset(&init_attr, 0, sizeof(init_attr));
 	init_attr.cap.max_send_wr = cb->txdepth;
 	init_attr.cap.max_recv_wr = 2;
 	init_attr.cap.max_recv_sge = 1;
 	init_attr.cap.max_send_sge = 1;
 	init_attr.qp_type = IB_QPT_RC;
 	init_attr.send_cq = cb->cq;
 	init_attr.recv_cq = cb->cq;
 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 
 	if (cb->server) {
 		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
 		if (!ret)
 			cb->qp = cb->child_cm_id->qp;
 	} else {
 		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
 		if (!ret)
 			cb->qp = cb->cm_id->qp;
 	}
 
 	return ret;
 }
 
 /*
  * Tear down the verbs objects in the reverse of their creation order in
  * krping_setup_qp(): queue pair, then completion queue, then
  * protection domain.
  */
 static void krping_free_qp(struct krping_cb *cb)
 {
 	ib_destroy_qp(cb->qp);
 	ib_destroy_cq(cb->cq);
 	ib_dealloc_pd(cb->pd);
 }
 
 /*
  * Allocate the protection domain, completion queue and queue pair for
  * this connection on cm_id's device.
  *
  * The CQ is sized at twice txdepth and, for the plain ping test, armed
  * for the next completion.  Returns 0 on success or a negative errno;
  * on failure the CQ and PD created so far are destroyed.
  */
 static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
 {
 	int ret;
 	cb->pd = ib_alloc_pd(cm_id->device);
 	if (IS_ERR(cb->pd)) {
 		PRINTF(cb, "ib_alloc_pd failed\n");
 		return PTR_ERR(cb->pd);
 	}
 	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
 
 	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
 
 	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
 			      cb, cb->txdepth * 2, 0);
 	if (IS_ERR(cb->cq)) {
 		PRINTF(cb, "ib_create_cq failed\n");
 		ret = PTR_ERR(cb->cq);
 		goto err1;
 	}
 	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
 
 	/* The wlat/rlat/bw/fastreg tests arm (or poll) the CQ themselves. */
 	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
 		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 		if (ret) {
 			/* Name the call that actually failed. */
 			PRINTF(cb, "ib_req_notify_cq failed\n");
 			goto err2;
 		}
 	}
 
 	ret = krping_create_qp(cb);
 	if (ret) {
 		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
 		goto err2;
 	}
 	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
 	return 0;
 err2:
 	ib_destroy_cq(cb->cq);
 err1:
 	ib_dealloc_pd(cb->pd);
 	return ret;
 }
 
 /*
  * return the (possibly rebound) rkey for the rdma buffer.
  * FASTREG mode: invalidate and rebind via fastreg wr.
  * MW mode: rebind the MW.
  * other modes: just return the mr rkey.
  *
  * buf is the DMA address of the buffer to expose; it is compared with
  * cb->start_dma_addr to choose between the "start" (remote-read source)
  * and "rdma" (remote-write sink) access rights.  post_inv selects, in
  * FASTREG mode, whether the chain starting at the LOCAL_INV WR or just
  * the FAST_REG_MR WR is posted.
  *
  * On a post/bind failure cb->state is set to ERROR.  The return value is
  * 0xffffffff for a failed MW bind or an unknown mode; in FASTREG mode the
  * fast-reg MR's rkey is returned even when the post failed.
  */
 static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
 {
 	u32 rkey = 0xffffffff;
 	u64 p;
 	struct ib_send_wr *bad_wr;
 	int i;
 	int ret;
 
 	switch (cb->mem) {
 	case FASTREG:
 		/* Remember the current rkey: it is the one to invalidate. */
 		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
 
 		/*
 		 * Update the fastreg key.
 		 */
 		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
 		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
 
 		/*
 		 * Update the fastreg WR with new buf info.
 		 */
 		if (buf == (u64)cb->start_dma_addr)
 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
 		else
 			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
 		/* Fill the page list with consecutive page addresses from buf. */
 		p = (u64)(buf & PAGE_MASK);
 		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; 
 		     i++, p += PAGE_SIZE) {
 			cb->page_list->page_list[i] = p;
 			DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p);
 		}
 
 		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
 			" iova_start %jx page_list_len %u\n",
 			post_inv,
 			cb->fastreg_wr.wr.fast_reg.rkey,
 			cb->fastreg_wr.wr.fast_reg.page_shift,
 			(unsigned)cb->fastreg_wr.wr.fast_reg.length,
 			(uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start,
 			cb->fastreg_wr.wr.fast_reg.page_list_len);
 
 		/* invalidate_wr chains to fastreg_wr (see krping_setup_wr). */
 		if (post_inv)
 			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
 		else
 			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			cb->state = ERROR;
 		}
 		rkey = cb->fastreg_mr->rkey;
 		break;
 	case MW:
 		/*
 		 * Update the MW with new buf info.
 		 */
 		if (buf == (u64)cb->start_dma_addr) {
 #ifdef BIND_INFO
 			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
 			cb->bind_attr.bind_info.mr = cb->start_mr;
 #else
 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
 			cb->bind_attr.mr = cb->start_mr;
 #endif
 		} else {
 #ifdef BIND_INFO
 			cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
 			cb->bind_attr.bind_info.mr = cb->rdma_mr;
 #else
 			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
 			cb->bind_attr.mr = cb->rdma_mr;
 #endif
 		}
 #ifdef BIND_INFO
 		cb->bind_attr.bind_info.addr = buf;
 #else
 		cb->bind_attr.addr = buf;
 #endif
 		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n",
 #ifdef BIND_INFO
 			cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey);
 #else
 			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
 #endif
 		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
 		if (ret) {
 			PRINTF(cb, "bind mw error %d\n", ret);
 			cb->state = ERROR;
 		} else
 			rkey = cb->mw->rkey;
 		break;
 	case MR:
 		if (buf == (u64)cb->start_dma_addr)
 			rkey = cb->start_mr->rkey;
 		else
 			rkey = cb->rdma_mr->rkey;
 		break;
 	case DMA:
 		rkey = cb->dma_mr->rkey;
 		break;
 	default:
 		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
 		cb->state = ERROR;
 		break;
 	}
 	return rkey;
 }
 
 /*
  * Fill cb->send_buf with the advertisement for the buffer at DMA
  * address "buf": address, rkey and cb->size, all in network order.
  *
  * Only the client, or either side in the wlat/rlat/bw/fastreg tests,
  * advertises a buffer; a plain server's sends carry no data and the
  * function does nothing for it.
  */
 static void krping_format_send(struct krping_cb *cb, u64 buf)
 {
 	struct krping_rdma_info *adv;
 	u32 key;
 
 	if (cb->server && !cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
 		return;
 
 	/*
 	 * Do the fastreg or mw bind first so the rkey that gets
 	 * advertised is the freshly bound one.
 	 */
 	key = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
 
 	adv = &cb->send_buf;
 	adv->buf = htonll(buf);
 	adv->rkey = htonl(key);
 	adv->size = htonl(cb->size);
 	DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
 	    (unsigned long long)buf, key, cb->size);
 }
 
 /*
  * Server side of the ping/pong loop.  Each iteration:
  *   1. waits for the client's source advertisement (RDMA_READ_ADV),
  *   2. RDMA-reads the ping data into cb->rdma_buf,
  *   3. sends a "go ahead",
  *   4. waits for the client's sink advertisement (RDMA_WRITE_ADV),
  *   5. RDMA-writes the data back,
  *   6. resets the state to CONNECTED and sends another "go ahead".
  * The loop ends on the first unexpected state or post failure.
  *
  * NOTE(review): strlen() is applied to cb->rdma_buf, whose contents come
  * from the peer; nothing visible here guarantees NUL termination within
  * cb->size bytes.
  */
 static void krping_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr, inv;
 	int ret;
 
 	while (1) {
 		/* Wait for client's Start STAG/TO/Len */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
 		if (cb->state != RDMA_READ_ADV) {
 			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
 				cb->state);
 			break;
 		}
 
 		DEBUG_LOG(cb, "server received source adv\n");
 
 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
 		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
 
 		/* Issue RDMA Read. */
 		if (cb->read_inv)
 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 		else {
 
 			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
 			if (cb->mem == FASTREG) {
 				/* 
 				 * Immediately follow the read with a 
 				 * fenced LOCAL_INV.
 				 */
 				cb->rdma_sq_wr.next = &inv;
 				memset(&inv, 0, sizeof inv);
 				inv.opcode = IB_WR_LOCAL_INV;
 				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
 				inv.send_flags = IB_SEND_FENCE;
 			}
 		}
 
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		/*
 		 * Unlink the on-stack "inv" WR before any exit path so that
 		 * cb->rdma_sq_wr never keeps a pointer into this frame.
 		 */
 		cb->rdma_sq_wr.next = NULL;
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		DEBUG_LOG(cb, "server posted rdma read req \n");
 
 		/* Wait for read completion */
 		wait_event_interruptible(cb->sem, 
 					 cb->state >= RDMA_READ_COMPLETE);
 		if (cb->state != RDMA_READ_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_READ_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server received read complete\n");
 
 		/* Display data in recv buf */
 		if (cb->verbose) {
 			if (strlen(cb->rdma_buf) > 128) {
 				char msgbuf[128];
 
 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
 				PRINTF(cb, "server ping data stripped: %s\n",
 				       msgbuf);
 			} else
 				PRINTF(cb, "server ping data: %s\n",
 				       cb->rdma_buf);
 		}
 
 		/* Tell client to continue */
 		if (cb->server && cb->server_invalidate) {
 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
 		} 
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 		DEBUG_LOG(cb, "server posted go ahead\n");
 
 		/* Wait for client's RDMA STAG/TO/Len */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
 		if (cb->state != RDMA_WRITE_ADV) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_ADV state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server received sink adv\n");
 
 		/* RDMA Write echo data */
 		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
 		if (cb->local_dma_lkey)
 			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
 		else 
 			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
 			
 		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
 			  cb->rdma_sq_wr.sg_list->lkey,
 			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
 			  cb->rdma_sq_wr.sg_list->length);
 
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for completion */
 		ret = wait_event_interruptible(cb->sem, cb->state >= 
 							 RDMA_WRITE_COMPLETE);
 		if (cb->state != RDMA_WRITE_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 		DEBUG_LOG(cb, "server rdma write complete \n");
 
 		/* Back to CONNECTED so the next recv starts a new cycle. */
 		cb->state = CONNECTED;
 
 		/* Tell client to begin again */
 		if (cb->server && cb->server_invalidate) {
 			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
 			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
 			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
 		} 
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 		DEBUG_LOG(cb, "server posted go ahead\n");
 	}
 }
 
 /*
  * RDMA READ latency test.
  *
  * Posts cb->count RDMA READs of cb->size bytes, one at a time, against
  * the peer's advertised buffer.  Each completion is awaited either by
  * polling the CQ (cb->poll set) or by sleeping on cb->sem until the
  * completion handler changes cb->state.  On success the elapsed wall
  * time is logged; on any error the function logs and returns early.
  */
 static void rlat_test(struct krping_cb *cb)
 {
 	int scnt;
 	int iters = cb->count;
 	struct timeval start_tv, stop_tv;
 	int ret;
 	struct ib_wc wc;
 	struct ib_send_wr *bad_wr;
 	int ne;
 
 	scnt = 0;
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	microtime(&start_tv);
 	/* Blocking mode: arm the CQ so the handler gets called. */
 	if (!cb->poll) {
 		cb->state = RDMA_READ_ADV;
 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
 	}
 	while (scnt < iters) {
 
 		/* Reset the state so the wait below sees this READ complete. */
 		cb->state = RDMA_READ_ADV;
 		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, 
 				"Couldn't post send: ret=%d scnt %d\n",
 				ret, scnt);
 			return;
 		}
 
 		/* ne: 1 = one completion, 0 = keep polling, <0 = failure. */
 		do {
 			if (!cb->poll) {
 				wait_event_interruptible(cb->sem, 
 					cb->state != RDMA_READ_ADV);
 				if (cb->state == RDMA_READ_COMPLETE) {
 					ne = 1;
 					ib_req_notify_cq(cb->cq, 
 						IB_CQ_NEXT_COMP);
 				} else {
 					ne = -1;
 				}
 			} else
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			if (cb->state == ERROR) {
 				PRINTF(cb, 
 					"state == ERROR...bailing scnt %d\n", 
 					scnt);
 				return;
 			}
 		} while (ne == 0);
 
 		if (ne < 0) {
 			PRINTF(cb, "poll CQ failed %d\n", ne);
 			return;
 		}
 		/* wc is only filled in when polling. */
 		if (cb->poll && wc.status != IB_WC_SUCCESS) {
 			PRINTF(cb, "Completion wth error at %s:\n",
 				cb->server ? "server" : "client");
 			PRINTF(cb, "Failed status %d: wr_id %d\n",
 				wc.status, (int) wc.wr_id);
 			return;
 		}
 		++scnt;
 	}
 	microtime(&stop_tv);
 
 	/* Borrow one second so the usec difference is non-negative. */
         if (stop_tv.tv_usec < start_tv.tv_usec) {
                 stop_tv.tv_usec += 1000000;
                 stop_tv.tv_sec  -= 1;
         }
 
 	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size);
 }
 
 /*
  * Write-latency test.
  *
  * Each pass of the main loop does up to three things, each limited to
  * cb->count occurrences:
  *   1. busy-wait until the first byte of cb->start_buf equals the receive
  *      counter (the client skips this before its very first send);
  *   2. store the send counter + 1 in the first byte of cb->rdma_buf and post
  *      the RDMA WRITE in cb->rdma_sq_wr;
  *   3. poll the CQ for one completion.
  * Cycle counts around the post and the poll are recorded for the first
  * min(1000, cb->count) iterations and summed into the final report.
  *
  * All timing arrays are released on every exit path, including the error
  * returns inside the loop.
  */
 static void wlat_test(struct krping_cb *cb)
 {
 	int ccnt, scnt, rcnt;
 	int iters=cb->count;
 	volatile char *poll_buf = (char *) cb->start_buf;
 	char *buf = (char *)cb->rdma_buf;
 	struct timeval start_tv, stop_tv;
 	cycles_t *post_cycles_start, *post_cycles_stop;
 	cycles_t *poll_cycles_start, *poll_cycles_stop;
 	cycles_t *last_poll_cycles_start;
 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
 	int i;
 	int cycle_iters = 1000;
 
 	ccnt = 0;
 	scnt = 0;
 	rcnt = 0;
 
 	/*
 	 * Allocate everything first and check once; kfree(NULL) is a no-op,
 	 * so the common exit can free whatever subset succeeded.
 	 */
 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
 		GFP_KERNEL);
 	if (!post_cycles_start || !post_cycles_stop || !poll_cycles_start ||
 	    !poll_cycles_stop || !last_poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		goto out;
 	}
 
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	/* Never index the timing arrays past the number of iterations run. */
 	if (cycle_iters > iters)
 		cycle_iters = iters;
 	microtime(&start_tv);
 	while (scnt < iters || ccnt < iters || rcnt < iters) {
 
 		/* Wait till buffer changes. */
 		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
 			++rcnt;
 			while (*poll_buf != (char)rcnt) {
 				if (cb->state == ERROR) {
 					PRINTF(cb, 
 						"state = ERROR, bailing\n");
 					goto out;
 				}
 			}
 		}
 
 		if (scnt < iters) {
 			struct ib_send_wr *bad_wr;
 
 			*buf = (char)scnt+1;
 			if (scnt < cycle_iters)
 				post_cycles_start[scnt] = get_cycles();
 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 				PRINTF(cb, 
 					"Couldn't post send: scnt=%d\n",
 					scnt);
 				goto out;
 			}
 			if (scnt < cycle_iters)
 				post_cycles_stop[scnt] = get_cycles();
 			scnt++;
 		}
 
 		if (ccnt < iters) {
 			struct ib_wc wc;
 			int ne;
 
 			if (ccnt < cycle_iters)
 				poll_cycles_start[ccnt] = get_cycles();
 			do {
 				/* Overwritten each spin: keeps the last poll start. */
 				if (ccnt < cycle_iters)
 					last_poll_cycles_start[ccnt] = 
 						get_cycles();
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			} while (ne == 0);
 			if (ccnt < cycle_iters)
 				poll_cycles_stop[ccnt] = get_cycles();
 			++ccnt;
 
 			if (ne < 0) {
 				PRINTF(cb, "poll CQ failed %d\n", ne);
 				goto out;
 			}
 			if (wc.status != IB_WC_SUCCESS) {
 				PRINTF(cb, 
 					"Completion wth error at %s:\n",
 					cb->server ? "server" : "client");
 				PRINTF(cb, 
 					"Failed status %d: wr_id %d\n",
 					wc.status, (int) wc.wr_id);
 				PRINTF(cb, 
 					"scnt=%d, rcnt=%d, ccnt=%d\n",
 					scnt, rcnt, ccnt);
 				goto out;
 			}
 		}
 	}
 	microtime(&stop_tv);
 
 	/* Borrow a second so the usec difference below is non-negative. */
 	if (stop_tv.tv_usec < start_tv.tv_usec) {
 		stop_tv.tv_usec += 1000000;
 		stop_tv.tv_sec  -= 1;
 	}
 
 	for (i=0; i < cycle_iters; i++) {
 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
 	}
 	PRINTF(cb,
 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size, cycle_iters,
 		(unsigned long long)sum_post, (unsigned long long)sum_poll, 
 		(unsigned long long)sum_last_poll);
 out:
 	kfree(post_cycles_start);
 	kfree(post_cycles_stop);
 	kfree(poll_cycles_start);
 	kfree(poll_cycles_stop);
 	kfree(last_poll_cycles_start);
 }
 
 /*
  * Bandwidth test.
  *
  * Keeps up to cb->txdepth RDMA WRITEs (cb->rdma_sq_wr) outstanding until
  * cb->count have been posted, reaping one completion from the CQ per pass
  * of the outer loop.  Cycle counts around the post and the poll are
  * recorded for the first min(1000, cb->count) iterations and summed into
  * the final report.
  *
  * All timing arrays are released on every exit path, including the error
  * returns inside the loop.
  */
 static void bw_test(struct krping_cb *cb)
 {
 	int ccnt, scnt;
 	int iters=cb->count;
 	struct timeval start_tv, stop_tv;
 	cycles_t *post_cycles_start, *post_cycles_stop;
 	cycles_t *poll_cycles_start, *poll_cycles_stop;
 	cycles_t *last_poll_cycles_start;
 	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
 	int i;
 	int cycle_iters = 1000;
 
 	ccnt = 0;
 	scnt = 0;
 
 	/*
 	 * Allocate everything first and check once; kfree(NULL) is a no-op,
 	 * so the common exit can free whatever subset succeeded.
 	 */
 	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
 	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
 		GFP_KERNEL);
 	if (!post_cycles_start || !post_cycles_stop || !poll_cycles_start ||
 	    !poll_cycles_stop || !last_poll_cycles_start) {
 		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
 		goto out;
 	}
 
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = cb->size;
 
 	/* Never index the timing arrays past the number of iterations run. */
 	if (cycle_iters > iters)
 		cycle_iters = iters;
 	microtime(&start_tv);
 	while (scnt < iters || ccnt < iters) {
 
 		/* Fill the send queue up to txdepth outstanding writes. */
 		while (scnt < iters && scnt - ccnt < cb->txdepth) {
 			struct ib_send_wr *bad_wr;
 
 			if (scnt < cycle_iters)
 				post_cycles_start[scnt] = get_cycles();
 			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 				PRINTF(cb, 
 					"Couldn't post send: scnt=%d\n",
 					scnt);
 				goto out;
 			}
 			if (scnt < cycle_iters)
 				post_cycles_stop[scnt] = get_cycles();
 			++scnt;
 		}
 
 		if (ccnt < iters) {
 			int ne;
 			struct ib_wc wc;
 
 			if (ccnt < cycle_iters)
 				poll_cycles_start[ccnt] = get_cycles();
 			do {
 				/* Overwritten each spin: keeps the last poll start. */
 				if (ccnt < cycle_iters)
 					last_poll_cycles_start[ccnt] = 
 						get_cycles();
 				ne = ib_poll_cq(cb->cq, 1, &wc);
 			} while (ne == 0);
 			if (ccnt < cycle_iters)
 				poll_cycles_stop[ccnt] = get_cycles();
 			ccnt += 1;
 
 			if (ne < 0) {
 				PRINTF(cb, "poll CQ failed %d\n", ne);
 				goto out;
 			}
 			if (wc.status != IB_WC_SUCCESS) {
 				PRINTF(cb, 
 					"Completion wth error at %s:\n",
 					cb->server ? "server" : "client");
 				PRINTF(cb, 
 					"Failed status %d: wr_id %d\n",
 					wc.status, (int) wc.wr_id);
 				goto out;
 			}
 		}
 	}
 	microtime(&stop_tv);
 
 	/* Borrow a second so the usec difference below is non-negative. */
 	if (stop_tv.tv_usec < start_tv.tv_usec) {
 		stop_tv.tv_usec += 1000000;
 		stop_tv.tv_sec  -= 1;
 	}
 
 	for (i=0; i < cycle_iters; i++) {
 		sum_post += post_cycles_stop[i] - post_cycles_start[i];
 		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
 		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
 	}
 	PRINTF(cb,
 		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
 		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
 		(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
 		(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
 		scnt, cb->size, cycle_iters, 
 		(unsigned long long)sum_post, (unsigned long long)sum_poll, 
 		(unsigned long long)sum_last_poll);
 out:
 	kfree(post_cycles_start);
 	kfree(post_cycles_stop);
 	kfree(poll_cycles_start);
 	kfree(poll_cycles_stop);
 	kfree(last_poll_cycles_start);
 }
 
 /*
  * Server side of the read-latency test.
  *
  * Drives the CQ handler by hand until cb->state reaches RDMA_READ_ADV,
  * sends the server's own buffer advertisement back, spins for that send's
  * completion, and then sleeps on cb->sem until cb->state becomes ERROR.
  */
 static void krping_rlat_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of the write-latency test.
  *
  * Drives the CQ handler by hand until cb->state reaches RDMA_READ_ADV,
  * sends the server's own buffer advertisement back, spins for that send's
  * completion, runs wlat_test(), and then sleeps on cb->sem until cb->state
  * becomes ERROR.
  */
 static void krping_wlat_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	wlat_test(cb);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of the bandwidth test.
  *
  * Drives the CQ handler by hand until cb->state reaches RDMA_READ_ADV,
  * sends the server's own buffer advertisement back, spins for that send's
  * completion, runs bw_test() only when cb->duplex is set, and then sleeps
  * on cb->sem until cb->state becomes ERROR.
  */
 static void krping_bw_test_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		bw_test(cb);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 static int fastreg_supported(struct krping_cb *cb, int server)
 {
 	struct ib_device *dev = server?cb->child_cm_id->device:
 					cb->cm_id->device;
 	struct ib_device_attr attr;
 	int ret;
 
 	ret = ib_query_device(dev, &attr);
 	if (ret) {
 		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
 		return 0;
 	}
 	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
 		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n",
 		    (unsigned long long)attr.device_cap_flags);
 		return 0;
 	}
 	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n",
 		(uintmax_t)attr.device_cap_flags);
 	return 1;
 }
 
 static int krping_bind_server(struct krping_cb *cb)
 {
 	struct sockaddr_in sin;
 	int ret;
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof sin;
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = cb->addr.s_addr;
 	sin.sin_port = cb->port;
 
 	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
 	if (ret) {
 		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
 		return ret;
 	}
 	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
 
 	DEBUG_LOG(cb, "rdma_listen\n");
 	ret = rdma_listen(cb->cm_id, 3);
 	if (ret) {
 		PRINTF(cb, "rdma_listen failed: %d\n", ret);
 		return ret;
 	}
 
 	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
 	if (cb->state != CONNECT_REQUEST) {
 		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
 			cb->state);
 		return -1;
 	}
 
 	if (cb->mem == FASTREG && !fastreg_supported(cb, 1))
 		return -EINVAL;
 
 	return 0;
 }
 
 /*
  * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads
  * complete.
  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
  */
 static void krping_fr_test5(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list **pl;
 	struct ib_send_wr *fr, *read, *bad;
 	struct ib_wc wc;
 	struct ib_sge *sgl;
 	u8 key = 0;
 	struct ib_mr **mr;
 	u8 **buf;
 	dma_addr_t *dma_addr;
 	int i;
 	int ret;
 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	time_t start;
 	int count = 0;
 	int scnt;
 	int depth = cb->txdepth >> 1;
 
 	if (!depth) {
 		PRINTF(cb, "txdepth must be > 1 for this test!\n");
 		return;
 	}
 
 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
 	read = kzalloc(sizeof *read * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth);
 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
 	if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) {
 		PRINTF(cb, "kzalloc failed\n");
 		goto err1;
 	}
 
 	for (scnt = 0; scnt < depth; scnt++) {
 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 		if (IS_ERR(pl[scnt])) {
 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
 			       PTR_ERR(pl[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
 
 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
 		if (IS_ERR(mr[scnt])) {
 			PRINTF(cb, "alloc_fr failed %ld\n",
 			       PTR_ERR(mr[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
 		ib_update_fast_reg_key(mr[scnt], ++key);
 
 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
 		if (!buf[scnt]) {
 			PRINTF(cb, "kmalloc failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
 						   buf[scnt], cb->size,
 						   DMA_BIDIRECTIONAL);
 		if (dma_mapping_error(cb->pd->device->dma_device,
 		    dma_addr[scnt])) {
 			PRINTF(cb, "dma_map failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
 		for (i=0; i<plen; i++) {
 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
 		}
 
 		sgl[scnt].lkey = mr[scnt]->rkey;
 		sgl[scnt].length = cb->size;
 		sgl[scnt].addr = (u64)buf[scnt];
 		DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n",
 			  __func__, scnt,  sgl[scnt].lkey, sgl[scnt].length,
 			  (uintmax_t)sgl[scnt].addr);
 
 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
 		fr[scnt].wr_id = scnt;
 		fr[scnt].send_flags = 0;
 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
 		fr[scnt].wr.fast_reg.length = cb->size;
 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
 		fr[scnt].wr.fast_reg.page_list_len = plen;
 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
 		fr[scnt].next = &read[scnt];
 		read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV;
 		read[scnt].wr_id = scnt;
 		read[scnt].send_flags = IB_SEND_SIGNALED;
 		read[scnt].wr.rdma.rkey = cb->remote_rkey;
 		read[scnt].wr.rdma.remote_addr = cb->remote_addr;
 		read[scnt].num_sge = 1;
 		read[scnt].sg_list = &sgl[scnt];
 		ret = ib_post_send(cb->qp, &fr[scnt], &bad);
 		if (ret) {
 			PRINTF(cb, "ib_post_send failed %d\n", ret);
 			goto err2;
 		}
 	}
 
 	start = time_uptime;
 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
 	while (!cb->count || cb->server || count < cb->count) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
 				  count);
 			wait_event_interruptible_timeout(cb->sem,
 							 cb->state == ERROR,
 							 1);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n",
 				       ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb,
 					       "completion error %u wr_id %ju "
 					       "opcode %d\n", wc.status,
 					       (uintmax_t)wc.wr_id, wc.opcode);
 					goto err2;
 				}
 				count++;
 				if (count == cb->count)
 					break;
 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
 				fr[wc.wr_id].wr.fast_reg.rkey =
 					mr[wc.wr_id]->rkey;
 				sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey;
 				ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad);
 				if (ret) {
 					PRINTF(cb,
 					       "ib_post_send failed %d\n", ret);
 					goto err2;
 				}
 			} else if (krping_sigpending()) {
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u "
 				       "opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 
 	DEBUG_LOG(cb, "destroying fr mrs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (mr[scnt]) {
 			ib_dereg_mr(mr[scnt]);
 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (buf[scnt]) {
 			dma_unmap_single(cb->pd->device->dma_device,
 					 dma_addr[scnt], cb->size,
 					 DMA_BIDIRECTIONAL);
 			kfree(buf[scnt]);
 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "destroying fr page lists!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (pl[scnt]) {
 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
 			ib_free_fast_reg_page_list(pl[scnt]);
 		}
 	}
 err1:
 	if (pl)
 		kfree(pl);
 	if (mr)
 		kfree(mr);
 	if (fr)
 		kfree(fr);
 	if (read)
 		kfree(read);
 	if (sgl)
 		kfree(sgl);
 	if (buf)
 		kfree(buf);
 	if (dma_addr)
 		kfree(dma_addr);
 }
 /*
  * Passive server side for fastreg tests: logs that it is waiting and then
  * sleeps on cb->sem until cb->state becomes ERROR.
  */
 static void
 krping_fr_test_server(struct krping_cb *cb)
 {
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Server side of fastreg test 5.
  *
  * Drives the CQ handler by hand until cb->state reaches RDMA_READ_ADV,
  * sends the server's own buffer advertisement back, spins for that send's
  * completion, runs krping_fr_test5() only when cb->duplex is set, and then
  * sleeps on cb->sem until cb->state becomes ERROR.
  */
 static void krping_fr_test5_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		krping_fr_test5(cb);
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Client side of fastreg test 5.
  *
  * Sets cb->state to RDMA_READ_ADV, sends the client's buffer advertisement,
  * spins for that send's completion, drives the CQ handler by hand until
  * cb->state reaches RDMA_WRITE_ADV, and then runs krping_fr_test5().
  * Every failure is logged and ends the test.
  */
 static void krping_fr_test5_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
 	    (uintmax_t)cb->remote_addr);
 
 	/* A void function may not "return expr;" in ISO C; just call it. */
 	krping_fr_test5(cb);
 }
 
 /*
  * sq-depth worth of write + fastreg + inv, reposting them as the invs
  * complete.
  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
  * If a count is given, then the last IO will have a bogus lkey in the
  * write work request.  This reproduces a fw bug where the connection
  * will get stuck if a fastreg is processed while the ulptx is failing
  * the bad write.
  */
 static void krping_fr_test6(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list **pl;
 	struct ib_send_wr *fr, *write, *inv, *bad;
 	struct ib_wc wc;
 	struct ib_sge *sgl;
 	u8 key = 0;
 	struct ib_mr **mr;
 	u8 **buf;
 	dma_addr_t *dma_addr;
 	int i;
 	int ret;
 	int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	unsigned long start;
 	int count = 0;
 	int scnt;
 	int depth = cb->txdepth  / 3;
 
 	if (!depth) {
 		PRINTF(cb, "txdepth must be > 3 for this test!\n");
 		return;
 	}
 
 	pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
 
 	mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
 
 	fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
 
 	sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
 
 	write = kzalloc(sizeof *write * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth);
 
 	inv = kzalloc(sizeof *inv * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth);
 
 	buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
 
 	dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
 	DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
 
 	if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) {
 		PRINTF(cb, "kzalloc failed\n");
 		goto err1;
 	}
 
 	for (scnt = 0; scnt < depth; scnt++) {
 		pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 		if (IS_ERR(pl[scnt])) {
 			PRINTF(cb, "alloc_fr_page_list failed %ld\n",
 			       PTR_ERR(pl[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
 
 		mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
 		if (IS_ERR(mr[scnt])) {
 			PRINTF(cb, "alloc_fr failed %ld\n",
 			       PTR_ERR(mr[scnt]));
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
 		ib_update_fast_reg_key(mr[scnt], ++key);
 
 		buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
 		if (!buf[scnt]) {
 			PRINTF(cb, "kmalloc failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
 		dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
 						   buf[scnt], cb->size,
 						   DMA_BIDIRECTIONAL);
 		if (dma_mapping_error(cb->pd->device->dma_device,
 		    dma_addr[scnt])) {
 			PRINTF(cb, "dma_map failed\n");
 			ret = -ENOMEM;
 			goto err2;
 		}
 		DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
 		for (i=0; i<plen; i++) {
 			pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
 			DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
 				  __func__, scnt, i,  (uintmax_t)pl[scnt]->page_list[i]);
 		}
 
 		write[scnt].opcode = IB_WR_RDMA_WRITE;
 		write[scnt].wr_id = scnt;
 		write[scnt].wr.rdma.rkey = cb->remote_rkey;
 		write[scnt].wr.rdma.remote_addr = cb->remote_addr;
 		write[scnt].num_sge = 1;
 		write[scnt].sg_list = &cb->rdma_sgl;
 		write[scnt].sg_list->length = cb->size;
 		write[scnt].next = &fr[scnt];
 
 		fr[scnt].opcode = IB_WR_FAST_REG_MR;
 		fr[scnt].wr_id = scnt;
 		fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
 		fr[scnt].wr.fast_reg.length = cb->size;
 		fr[scnt].wr.fast_reg.page_list = pl[scnt];
 		fr[scnt].wr.fast_reg.page_list_len = plen;
 		fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
 		fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 		fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
 		fr[scnt].next = &inv[scnt];
 
 		inv[scnt].opcode = IB_WR_LOCAL_INV;
 		inv[scnt].send_flags = IB_SEND_SIGNALED;
 		inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey;
 
 		ret = ib_post_send(cb->qp, &write[scnt], &bad);
 		if (ret) {
 			PRINTF(cb, "ib_post_send failed %d\n", ret);
 			goto err2;
 		}
 	}
 
 	start = time_uptime;
 	DEBUG_LOG(cb, "%s starting IO.\n", __func__);
 	while (!cb->count || cb->server || count < cb->count) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
 				  count);
 			wait_event_interruptible_timeout(cb->sem,
 							 cb->state == ERROR,
 							 1);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n",
 				       ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb,
 					       "completion error %u wr_id %ju "
 					       "opcode %d\n", wc.status,
 					       (uintmax_t)wc.wr_id, wc.opcode);
 					goto err2;
 				}
 				count++;
 				if (count == (cb->count -1))
 					cb->rdma_sgl.lkey = 0x00dead;
 				if (count == cb->count)
 					break;
 				ib_update_fast_reg_key(mr[wc.wr_id], ++key);
 				fr[wc.wr_id].wr.fast_reg.rkey =
 					mr[wc.wr_id]->rkey;
 				inv[wc.wr_id].ex.invalidate_rkey =
 					mr[wc.wr_id]->rkey;
 				ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad);
 				if (ret) {
 					PRINTF(cb,
 					       "ib_post_send failed %d\n", ret);
 					goto err2;
 				}
 			} else if (krping_sigpending()){
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u "
 				       "opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 
 	DEBUG_LOG(cb, "destroying fr mrs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (mr[scnt]) {
 			ib_dereg_mr(mr[scnt]);
 			DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (buf[scnt]) {
 			dma_unmap_single(cb->pd->device->dma_device,
 					 dma_addr[scnt], cb->size,
 					 DMA_BIDIRECTIONAL);
 			kfree(buf[scnt]);
 			DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
 		}
 	}
 	DEBUG_LOG(cb, "destroying fr page lists!\n");
 	for (scnt = 0; scnt < depth; scnt++) {
 		if (pl[scnt]) {
 			DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
 			ib_free_fast_reg_page_list(pl[scnt]);
 		}
 	}
 err1:
 	if (pl)
 		kfree(pl);
 	if (mr)
 		kfree(mr);
 	if (fr)
 		kfree(fr);
 	if (write)
 		kfree(write);
 	if (inv)
 		kfree(inv);
 	if (sgl)
 		kfree(sgl);
 	if (buf)
 		kfree(buf);
 	if (dma_addr)
 		kfree(dma_addr);
 }
 
 /*
  * Server side of fastreg test 6.
  *
  * Drives the CQ handler by hand until cb->state reaches RDMA_READ_ADV,
  * sends the server's own buffer advertisement back, spins for that send's
  * completion, runs krping_fr_test6() only when cb->duplex is set, and then
  * sleeps on cb->sem until cb->state becomes ERROR.
  */
 static void krping_fr_test6_server(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	/* Spin waiting for client's Start STAG/TO/Len */
 	while (cb->state < RDMA_READ_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
 		  cb->remote_rkey, (uintmax_t)cb->remote_addr);
 
 	/* Send STAG/TO/Len to client */
 	krping_format_send(cb, cb->start_dma_addr);
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completiong error %d\n", wc.status);
 		return;
 	}
 
 	if (cb->duplex)
 		krping_fr_test6(cb);
 	DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
 	wait_event_interruptible(cb->sem, cb->state == ERROR);
 }
 
 /*
  * Client side of fastreg test 6.
  *
  * Sets cb->state to RDMA_READ_ADV, sends the client's buffer advertisement,
  * spins for that send's completion, drives the CQ handler by hand until
  * cb->state reaches RDMA_WRITE_ADV, and then runs krping_fr_test6().
  * Every failure is logged and ends the test.
  */
 static void krping_fr_test6_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized: otherwise ret receives the result of the "== 0"
 	 * comparison and a negative poll error can never be detected.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 	DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
 	    (uintmax_t)cb->remote_addr);
 
 	/* A void function may not "return expr;" in ISO C; just call it. */
 	krping_fr_test6(cb);
 }
 
 /*
  * Top-level server flow: bind and wait for a connection request, set up
  * the QP and buffers on cb->child_cm_id, post the first receive, accept,
  * and then run whichever test the control block selects (wlat, rlat, bw,
  * one of the numbered fastreg tests, or the default ping test).
  *
  * Resources are unwound in reverse order of acquisition.  An unknown
  * fastreg test number skips rdma_disconnect() and goes straight to the
  * buffer teardown.
  */
 static void
 krping_run_server(struct krping_cb *cb)
 {
 	struct ib_recv_wr *bad_recv;
 	int rc;
 
 	rc = krping_bind_server(cb);
 	if (rc != 0)
 		return;
 
 	rc = krping_setup_qp(cb, cb->child_cm_id);
 	if (rc != 0) {
 		PRINTF(cb, "setup_qp failed: %d\n", rc);
 		goto destroy_id;
 	}
 
 	rc = krping_setup_buffers(cb);
 	if (rc != 0) {
 		PRINTF(cb, "krping_setup_buffers failed: %d\n", rc);
 		goto free_qp;
 	}
 
 	rc = ib_post_recv(cb->qp, &cb->rq_wr, &bad_recv);
 	if (rc != 0) {
 		PRINTF(cb, "ib_post_recv failed: %d\n", rc);
 		goto free_buffers;
 	}
 
 	rc = krping_accept(cb);
 	if (rc != 0) {
 		PRINTF(cb, "connect error %d\n", rc);
 		goto free_buffers;
 	}
 
 	/* The first matching mode flag wins. */
 	if (cb->wlat) {
 		krping_wlat_test_server(cb);
 	} else if (cb->rlat) {
 		krping_rlat_test_server(cb);
 	} else if (cb->bw) {
 		krping_bw_test_server(cb);
 	} else if (cb->frtest) {
 		switch (cb->testnum) {
 		case 1:
 		case 2:
 		case 3:
 		case 4:
 			krping_fr_test_server(cb);
 			break;
 		case 5:
 			krping_fr_test5_server(cb);
 			break;
 		case 6:
 			krping_fr_test6_server(cb);
 			break;
 		default:
 			PRINTF(cb, "unknown fr test %d\n", cb->testnum);
 			goto free_buffers;
 		}
 	} else {
 		krping_test_server(cb);
 	}
 	rdma_disconnect(cb->child_cm_id);
 
 free_buffers:
 	krping_free_buffers(cb);
 free_qp:
 	krping_free_qp(cb);
 destroy_id:
 	rdma_destroy_id(cb->child_cm_id);
 }
 
 static void krping_test_client(struct krping_cb *cb)
 {
 	int ping, start, cc, i, ret;
 	struct ib_send_wr *bad_wr;
 	unsigned char c;
 
 	start = 65;
 	for (ping = 0; !cb->count || ping < cb->count; ping++) {
 		cb->state = RDMA_READ_ADV;
 
 		/* Put some ascii text in the buffer. */
 		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
 		for (i = cc, c = start; i < cb->size; i++) {
 			cb->start_buf[i] = c;
 			c++;
 			if (c > 122)
 				c = 65;
 		}
 		start++;
 		if (start > 122)
 			start = 65;
 		cb->start_buf[cb->size - 1] = 0;
 
 		krping_format_send(cb, cb->start_dma_addr);
 		if (cb->state == ERROR) {
 			PRINTF(cb, "krping_format_send failed\n");
 			break;
 		}
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for server to ACK */
 		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
 		if (cb->state != RDMA_WRITE_ADV) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_ADV state %d\n",
 			       cb->state);
 			break;
 		}
 
 		krping_format_send(cb, cb->rdma_dma_addr);
 		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 		if (ret) {
 			PRINTF(cb, "post send error %d\n", ret);
 			break;
 		}
 
 		/* Wait for the server to say the RDMA Write is complete. */
 		wait_event_interruptible(cb->sem, 
 					 cb->state >= RDMA_WRITE_COMPLETE);
 		if (cb->state != RDMA_WRITE_COMPLETE) {
 			PRINTF(cb, 
 			       "wait for RDMA_WRITE_COMPLETE state %d\n",
 			       cb->state);
 			break;
 		}
 
 		if (cb->validate)
 			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
 				PRINTF(cb, "data mismatch!\n");
 				break;
 			}
 
 		if (cb->verbose) {
 			if (strlen(cb->rdma_buf) > 128) {
 				char msgbuf[128];
 
 				strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
 				PRINTF(cb, "ping data stripped: %s\n",
 				       msgbuf);
 			} else
 				PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
 		}
 #ifdef SLOW_KRPING
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 #endif
 	}
 }
 
 /*
  * Client side of the read-latency test.
  *
  * Posts a send describing the local start buffer, spins until that send
  * completes, keeps running krping_cq_event_handler() until cb->state
  * reaches RDMA_WRITE_ADV or beyond, and then runs rlat_test().
  * Any failure is reported with PRINTF() and ends the test early.
  */
 static void krping_rlat_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized so that ret holds the value returned by
 	 * ib_poll_cq() and not the result of the "== 0" comparison;
 	 * otherwise a negative poll error is never detected and wc is
 	 * examined without having been filled in.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 #if 0
 {
 	int i;
 	struct timeval start, stop;
 	time_t sec;
 	suseconds_t usec;
 	unsigned long long elapsed;
 	struct ib_wc wc;
 	struct ib_send_wr *bad_wr;
 	int ne;
 	
 	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
 	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
 	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
 	cb->rdma_sq_wr.sg_list->length = 0;
 	cb->rdma_sq_wr.num_sge = 0;
 
 	microtime(&start);
 	for (i=0; i < 100000; i++) {
 		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
 			PRINTF(cb, "Couldn't post send\n");
 			return;
 		}
 		do {
 			ne = ib_poll_cq(cb->cq, 1, &wc);
 		} while (ne == 0);
 		if (ne < 0) {
 			PRINTF(cb, "poll CQ failed %d\n", ne);
 			return;
 		}
 		if (wc.status != IB_WC_SUCCESS) {
 			PRINTF(cb, "Completion wth error at %s:\n",
 				cb->server ? "server" : "client");
 			PRINTF(cb, "Failed status %d: wr_id %d\n",
 				wc.status, (int) wc.wr_id);
 			return;
 		}
 	}
 	microtime(&stop);
 	
 	if (stop.tv_usec < start.tv_usec) {
 		stop.tv_usec += 1000000;
 		stop.tv_sec  -= 1;
 	}
 	sec     = stop.tv_sec - start.tv_sec;
 	usec    = stop.tv_usec - start.tv_usec;
 	elapsed = sec * 1000000 + usec;
 	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
 }
 #endif
 
 	rlat_test(cb);
 }
 
 /*
  * Client side of the write-latency test.
  *
  * Posts a send describing the local start buffer, spins until that send
  * completes, keeps running krping_cq_event_handler() until cb->state
  * reaches RDMA_WRITE_ADV or beyond, and then runs wlat_test().
  * Any failure is reported with PRINTF() and ends the test early.
  */
 static void krping_wlat_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized so that ret holds the value returned by
 	 * ib_poll_cq() and not the result of the "== 0" comparison;
 	 * otherwise a negative poll error is never detected and wc is
 	 * examined without having been filled in.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	wlat_test(cb);
 }
 
 /*
  * Client side of the bandwidth test.
  *
  * Posts a send describing the local start buffer, spins until that send
  * completes, keeps running krping_cq_event_handler() until cb->state
  * reaches RDMA_WRITE_ADV or beyond, and then runs bw_test().
  * Any failure is reported with PRINTF() and ends the test early.
  */
 static void krping_bw_test_client(struct krping_cb *cb)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_wc wc;
 	int ret;
 
 	cb->state = RDMA_READ_ADV;
 
 	/* Send STAG/TO/Len to server */
 	krping_format_send(cb, cb->start_dma_addr);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "krping_format_send failed\n");
 		return;
 	}
 	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
 	if (ret) {
 		PRINTF(cb, "post send error %d\n", ret);
 		return;
 	}
 
 	/*
 	 * Spin waiting for send completion.  The assignment must be
 	 * parenthesized so that ret holds the value returned by
 	 * ib_poll_cq() and not the result of the "== 0" comparison;
 	 * otherwise a negative poll error is never detected and wc is
 	 * examined without having been filled in.
 	 */
 	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
 		;
 	if (ret < 0) {
 		PRINTF(cb, "poll error %d\n", ret);
 		return;
 	}
 	if (wc.status) {
 		PRINTF(cb, "send completion error %d\n", wc.status);
 		return;
 	}
 
 	/* Spin waiting for server's Start STAG/TO/Len */
 	while (cb->state < RDMA_WRITE_ADV) {
 		krping_cq_event_handler(cb->cq, cb);
 	}
 
 	bw_test(cb);
 }
 
 
 /*
  * fastreg 2 valid different mrs and verify the completions.
  *
  * Allocates one page list and two fast-register MRs, posts one signaled
  * IB_WR_FAST_REG_MR work request per MR, then polls the CQ (pausing up to
  * HZ ticks between polls) until two completions have been seen, a poll
  * fails or a signal is pending.  The CQ is drained and all resources are
  * released before returning.
  */
 static void krping_fr_test1(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1, *mr2;
 	int i;
 	int ret;
 	int size = cb->size;
 	/* Number of pages needed to cover size bytes. */
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	/* Report the error carried by the MR pointer, not by pl. */
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 	mr2 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr2)) {
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr2));
 		goto err2;
 	}
 
 
 	/* Page addresses are synthetic: page i maps to i * PAGE_SIZE. */
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	/* Same work request, now aimed at the second MR. */
 	fr.wr.fast_reg.rkey = mr2->rkey;
 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 2);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr2!\n");
 
 	ib_dereg_mr(mr2);
 err2:
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg the same mr twice, 2nd one should produce error cqe.
  *
  * Allocates one page list and one fast-register MR, posts the identical
  * signaled IB_WR_FAST_REG_MR work request twice, then polls the CQ (pausing
  * up to HZ ticks between polls) until two completions have been seen, a
  * poll fails or a signal is pending.  The CQ is drained and all resources
  * are released before returning.
  */
 static void krping_fr_test2(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1;
 	int i;
 	int ret;
 	int size = cb->size;
 	/* Number of pages needed to cover size bytes. */
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	/* Report the error carried by the MR pointer, not by pl. */
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 
 	/* Page addresses are synthetic: page i maps to i * PAGE_SIZE. */
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	/* Deliberately post the very same registration a second time. */
 	DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 2);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg pipelined in a loop as fast as we can until the user interrupts.
  * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
  *
  * Each posted chain is a fast-register work request followed by a local
  * invalidate of the same rkey, both signaled, so every post accounts for
  * two outstanding completions (scnt).  The registration length is picked
  * at random in [1, cb->size] for every post.  The loop ends on a post or
  * poll failure, an error completion, a pending signal, or when cb->state
  * is ERROR after the periodic pause.
  */
 static void krping_fr_test3(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, inv, *bad;
 	struct ib_wc wc;
 	u8 key = 0;
 	struct ib_mr *mr;
 	int i;
 	int ret;
 	int size = cb->size;
 	/* Number of pages needed to cover size bytes. */
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	unsigned long start;
 	int count = 0;
 	int scnt = 0;
 
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	/* Report the error carried by the MR pointer, not by pl. */
 	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr)) {
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
 		goto err1;
 	}
 
 	/* Page addresses are synthetic: page i maps to i * PAGE_SIZE. */
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	/* Chain the invalidate behind the registration. */
 	fr.next = &inv;
 	memset(&inv, 0, sizeof inv);
 	inv.opcode = IB_WR_LOCAL_INV;
 	inv.send_flags = IB_SEND_SIGNALED;
 
 	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
 	start = time_uptime;
 	while (1) {
 		if ((time_uptime - start) >= 9) {
 			DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
 			wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 			if (cb->state == ERROR)
 				break;
 			start = time_uptime;
 		}
 		/* Keep up to half of the send queue depth outstanding. */
 		while (scnt < (cb->txdepth>>1)) {
 			ib_update_fast_reg_key(mr, ++key);
 			fr.wr.fast_reg.rkey = mr->rkey;
 			inv.ex.invalidate_rkey = mr->rkey;
 			size = arc4random() % cb->size;
 			if (size == 0)
 				size = cb->size;
 			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 			fr.wr.fast_reg.length = size;
 			fr.wr.fast_reg.page_list_len = plen;
 			ret = ib_post_send(cb->qp, &fr, &bad);
 			if (ret) {
 				PRINTF(cb, "ib_post_send failed %d\n", ret);
 				goto err2;
 			}
 			scnt+=2;
 		}
 
 		/* Reap whatever has completed so far. */
 		do {
 			ret = ib_poll_cq(cb->cq, 1, &wc);
 			if (ret < 0) {
 				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 				goto err2;
 			}
 			if (ret == 1) {
 				if (wc.status) {
 					PRINTF(cb, "completion error %u\n", wc.status);
 					goto err2;
 				}
 				count++;
 				scnt--;
 			}
 			else if (krping_sigpending()) {
 				PRINTF(cb, "signal!\n");
 				goto err2;
 			}
 		} while (ret == 1);
 	}
 err2:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			if (wc.status) {
 				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
 			}
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "fr_test: done!\n");
 	ib_dereg_mr(mr);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * fastreg 1 and invalidate 1 mr and verify completion.
  *
  * Posts a signaled fast-register work request chained to a local
  * invalidate of the same rkey (the invalidate is not signaled), then polls
  * the CQ (pausing up to HZ ticks between polls) until one completion has
  * been seen, a poll fails or a signal is pending.  The CQ is drained and
  * all resources are released before returning.
  */
 static void krping_fr_test4(struct krping_cb *cb)
 {
 	struct ib_fast_reg_page_list *pl;
 	struct ib_send_wr fr, inv, *bad;
 	struct ib_wc wc;
 	struct ib_mr *mr1;
 	int i;
 	int ret;
 	int size = cb->size;
 	/* Number of pages needed to cover size bytes. */
 	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
 	int count = 0;
 
 	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
 	if (IS_ERR(pl)) {
 		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
 		return;
 	}
 
 	/* Report the error carried by the MR pointer, not by pl. */
 	mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
 	if (IS_ERR(mr1)) {
 		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1));
 		goto err1;
 	}
 
 	/* Page addresses are synthetic: page i maps to i * PAGE_SIZE. */
 	for (i=0; i<plen; i++)
 		pl->page_list[i] = i * PAGE_SIZE;
 
 	memset(&fr, 0, sizeof fr);
 	fr.opcode = IB_WR_FAST_REG_MR;
 	fr.wr_id = 1;
 	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	fr.wr.fast_reg.length = size;
 	fr.wr.fast_reg.page_list = pl;
 	fr.wr.fast_reg.page_list_len = plen;
 	fr.wr.fast_reg.iova_start = 0;
 	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
 	fr.send_flags = IB_SEND_SIGNALED;
 	fr.wr.fast_reg.rkey = mr1->rkey;
 	/* Chain the invalidate behind the registration. */
 	fr.next = &inv;
 	memset(&inv, 0, sizeof inv);
 	inv.opcode = IB_WR_LOCAL_INV;
 	inv.ex.invalidate_rkey = mr1->rkey;
 
 	DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
 	ret = ib_post_send(cb->qp, &fr, &bad);
 	if (ret) {
 		PRINTF(cb, "ib_post_send failed %d\n", ret);
 		goto err3;
 	}
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			goto err3;
 		}
 		if (ret == 1) {
 			DEBUG_LOG(cb, "completion status %u wr %s\n",
 				  wc.status, wc.wr_id == 1 ? "fr" : "inv");
 			count++;
 		} else if (krping_sigpending()) {
 			PRINTF(cb, "signal!\n");
 			goto err3;
 		}
 		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	} while (count != 1);
 err3:
 	DEBUG_LOG(cb, "sleeping 1 second\n");
 	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
 	DEBUG_LOG(cb, "draining the cq...\n");
 	do {
 		ret = ib_poll_cq(cb->cq, 1, &wc);
 		if (ret < 0) {
 			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
 			break;
 		}
 		if (ret == 1) {
 			PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
 		}
 	} while (ret == 1);
 	DEBUG_LOG(cb, "destroying fr mr1!\n");
 	ib_dereg_mr(mr1);
 err1:
 	DEBUG_LOG(cb, "destroying fr page list!\n");
 	ib_free_fast_reg_page_list(pl);
 	DEBUG_LOG(cb, "%s done!\n", __func__);
 }
 
 /*
  * Client-side dispatcher for the fast-register tests: runs the test whose
  * number is stored in cb->testnum (1 through 6) and reports any other
  * number with PRINTF().
  */
 static void krping_fr_test(struct krping_cb *cb)
 {
 	if (cb->testnum == 1)
 		krping_fr_test1(cb);
 	else if (cb->testnum == 2)
 		krping_fr_test2(cb);
 	else if (cb->testnum == 3)
 		krping_fr_test3(cb);
 	else if (cb->testnum == 4)
 		krping_fr_test4(cb);
 	else if (cb->testnum == 5)
 		krping_fr_test5_client(cb);
 	else if (cb->testnum == 6)
 		krping_fr_test6_client(cb);
 	else
 		PRINTF(cb, "Unkown frtest num %u\n", cb->testnum);
 }
 
 static int krping_connect_client(struct krping_cb *cb)
 {
 	struct rdma_conn_param conn_param;
 	int ret;
 
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 	conn_param.retry_count = 10;
 
 	ret = rdma_connect(cb->cm_id, &conn_param);
 	if (ret) {
 		PRINTF(cb, "rdma_connect error %d\n", ret);
 		return ret;
 	}
 
 	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
 	if (cb->state == ERROR) {
 		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
 		return -1;
 	}
 
 	DEBUG_LOG(cb, "rdma_connect successful\n");
 	return 0;
 }
 
 static int krping_bind_client(struct krping_cb *cb)
 {
 	struct sockaddr_in sin;
 	int ret;
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof sin;
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = cb->addr.s_addr;
 	sin.sin_port = cb->port;
 
 	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
 				2000);
 	if (ret) {
 		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
 		return ret;
 	}
 
 	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
 	if (cb->state != ROUTE_RESOLVED) {
 		PRINTF(cb, 
 		       "addr/route resolution did not resolve: state %d\n",
 		       cb->state);
 		return -EINTR;
 	}
 
 	if (cb->mem == FASTREG && !fastreg_supported(cb, 0))
 		return -EINVAL;
 
 	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
 	return 0;
 }
 
 /*
  * Drive one client session: resolve the server address, create the QP and
  * buffers on cb->cm_id, post the initial receive, connect, run the test
  * selected by the cb flags, then disconnect and release buffers and QP.
  * Errors are reported with PRINTF() and unwind through the labels below.
  */
 static void krping_run_client(struct krping_cb *cb)
 {
 	struct ib_recv_wr *bad_recv;
 	int rc;
 
 	if (krping_bind_client(cb) != 0)
 		return;
 
 	rc = krping_setup_qp(cb, cb->cm_id);
 	if (rc != 0) {
 		PRINTF(cb, "setup_qp failed: %d\n", rc);
 		return;
 	}
 
 	rc = krping_setup_buffers(cb);
 	if (rc != 0) {
 		PRINTF(cb, "krping_setup_buffers failed: %d\n", rc);
 		goto release_qp;
 	}
 
 	rc = ib_post_recv(cb->qp, &cb->rq_wr, &bad_recv);
 	if (rc != 0) {
 		PRINTF(cb, "ib_post_recv failed: %d\n", rc);
 		goto release_buffers;
 	}
 
 	rc = krping_connect_client(cb);
 	if (rc != 0) {
 		PRINTF(cb, "connect error %d\n", rc);
 		goto release_buffers;
 	}
 
 	/* Exactly one test body runs; the plain ping test is the fallback. */
 	if (cb->wlat) {
 		krping_wlat_test_client(cb);
 	} else if (cb->rlat) {
 		krping_rlat_test_client(cb);
 	} else if (cb->bw) {
 		krping_bw_test_client(cb);
 	} else if (cb->frtest) {
 		krping_fr_test(cb);
 	} else {
 		krping_test_client(cb);
 	}
 
 	rdma_disconnect(cb->cm_id);
 release_buffers:
 	krping_free_buffers(cb);
 release_qp:
 	krping_free_qp(cb);
 }
 
 /*
  * Parse a krping command string and run the requested client or server
  * session to completion.
  *
  * cmd    - option string handed to krping_getopt().
  * cookie - opaque value stored in cb->cookie.
  *
  * A control block is allocated, linked onto krping_cbs for the duration of
  * the call and freed before returning.  Returns 0 when a session was run,
  * -ENOMEM if the control block cannot be allocated, the PTR_ERR() of a
  * failed rdma_create_id(), or an EINVAL value for bad options.
  *
  * NOTE(review): the 'a', 'S' and 'C' option errors yield positive EINVAL
  * while every other failure is negative; left as-is because the callers
  * are not visible here - confirm which sign they expect.
  */
 int krping_doit(char *cmd, void *cookie)
 {
 	struct krping_cb *cb;
 	int op;
 	int ret = 0;
 	char *optarg;
 	unsigned long optint;
 
 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
 	if (!cb)
 		return -ENOMEM;
 
 	mutex_lock(&krping_mutex);
 	list_add_tail(&cb->list, &krping_cbs);
 	mutex_unlock(&krping_mutex);
 
 	/* Defaults; everything else starts out zero from kzalloc(). */
 	cb->cookie = cookie;
 	cb->server = -1;
 	cb->state = IDLE;
 	cb->size = 64;
 	cb->txdepth = RPING_SQ_DEPTH;
 	cb->mem = DMA;
 	init_waitqueue_head(&cb->sem);
 
 	/*
 	 * A bad option only records the error in ret; parsing continues so
 	 * that every problem in the command line gets reported.
 	 */
 	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
 			      &optint)) != 0) {
 		switch (op) {
 		case 'a':
 			cb->addr_str = optarg;
 			DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
 			if (!inet_aton(optarg, &cb->addr)) {
 				PRINTF(cb, "bad addr string %s\n",
 				    optarg);
 				ret = EINVAL;
 			}
 			break;
 		case 'p':
 			cb->port = htons(optint);
 			DEBUG_LOG(cb, "port %d\n", (int)optint);
 			break;
 		case 'P':
 			cb->poll = 1;
 			/* Previously logged "server", copied from 's'. */
 			DEBUG_LOG(cb, "poll\n");
 			break;
 		case 's':
 			cb->server = 1;
 			DEBUG_LOG(cb, "server\n");
 			break;
 		case 'c':
 			cb->server = 0;
 			DEBUG_LOG(cb, "client\n");
 			break;
 		case 'S':
 			cb->size = optint;
 			if ((cb->size < 1) ||
 			    (cb->size > RPING_BUFSIZE)) {
 				PRINTF(cb, "Invalid size %d "
 				       "(valid range is 1 to %d)\n",
 				       cb->size, RPING_BUFSIZE);
 				ret = EINVAL;
 			} else
 				DEBUG_LOG(cb, "size %d\n", (int)optint);
 			break;
 		case 'C':
 			cb->count = optint;
 			if (cb->count < 0) {
 				PRINTF(cb, "Invalid count %d\n",
 					cb->count);
 				ret = EINVAL;
 			} else
 				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
 			break;
 		case 'v':
 			cb->verbose++;
 			DEBUG_LOG(cb, "verbose\n");
 			break;
 		case 'V':
 			cb->validate++;
 			DEBUG_LOG(cb, "validate data\n");
 			break;
 		case 'l':
 			cb->wlat++;
 			break;
 		case 'L':
 			cb->rlat++;
 			break;
 		case 'B':
 			cb->bw++;
 			break;
 		case 'd':
 			cb->duplex++;
 			break;
 		case 'm':
 			/* Prefix match on the memory mode name. */
 			if (!strncmp(optarg, "dma", 3))
 				cb->mem = DMA;
 			else if (!strncmp(optarg, "fastreg", 7))
 				cb->mem = FASTREG;
 			else if (!strncmp(optarg, "mw", 2))
 				cb->mem = MW;
 			else if (!strncmp(optarg, "mr", 2))
 				cb->mem = MR;
 			else {
 				PRINTF(cb, "unknown mem mode %s.  "
 					"Must be dma, fastreg, mw, or mr\n",
 					optarg);
 				ret = -EINVAL;
 			}
 			break;
 		case 'I':
 			cb->server_invalidate = 1;
 			break;
 		case 'T':
 			cb->txdepth = optint;
 			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
 			break;
 		case 'Z':
 			cb->local_dma_lkey = 1;
 			DEBUG_LOG(cb, "using local dma lkey\n");
 			break;
 		case 'R':
 			cb->read_inv = 1;
 			DEBUG_LOG(cb, "using read-with-inv\n");
 			break;
 		case 'f':
 			cb->frtest = 1;
 			cb->testnum = optint;
 			DEBUG_LOG(cb, "fast-reg test!\n");
 			break;
 		default:
 			PRINTF(cb, "unknown opt %s\n", optarg);
 			ret = -EINVAL;
 			break;
 		}
 	}
 	if (ret)
 		goto out;
 
 	/* Neither 's' nor 'c' was given. */
 	if (cb->server == -1) {
 		PRINTF(cb, "must be either client or server\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
 		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
 		ret = -EINVAL;
 		goto out;
 	}
 	if (cb->server_invalidate && cb->mem != FASTREG) {
 		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (cb->read_inv && cb->mem != FASTREG) {
 		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) {
 		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(cb->cm_id)) {
 		ret = PTR_ERR(cb->cm_id);
 		PRINTF(cb, "rdma_create_id error %d\n", ret);
 		goto out;
 	}
 	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
 
 	if (cb->server)
 		krping_run_server(cb);
 	else
 		krping_run_client(cb);
 
 	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
 	rdma_destroy_id(cb->cm_id);
 out:
 	/* Unlink before freeing so list walkers never see a stale cb. */
 	mutex_lock(&krping_mutex);
 	list_del(&cb->list);
 	mutex_unlock(&krping_mutex);
 	kfree(cb);
 	return ret;
 }
 
 void
 krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
 {
 	struct krping_cb *cb;
 
 	mutex_lock(&krping_mutex);
 	list_for_each_entry(cb, &krping_cbs, list)
 	    (*f)(cb->pd ? &cb->stats : NULL, arg);
 	mutex_unlock(&krping_mutex);
 }
 
 /* Initialize krping_mutex, the lock taken around krping_cbs list updates. */
 void krping_init(void)
 {
 	mutex_init(&krping_mutex);
 }
Index: head/sys/ofed/drivers/infiniband/core/fmr_pool.c
===================================================================
--- head/sys/ofed/drivers/infiniband/core/fmr_pool.c	(revision 320071)
+++ head/sys/ofed/drivers/infiniband/core/fmr_pool.c	(revision 320072)
@@ -1,544 +1,545 @@
 /*
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #include <linux/errno.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/jhash.h>
 #include <linux/kthread.h>
+#include <linux/wait.h>
 
 #include <rdma/ib_fmr_pool.h>
 
 #include "core_priv.h"
 
 #define PFX "fmr_pool: "
 
 /* Compile-time constants for the FMR pool: remap limit and cache geometry. */
 enum {
 	/* Remap limit used when the device reports max_map_per_fmr == 0. */
 	IB_FMR_MAX_REMAPS = 32,
 
 	/* log2 of the number of cache hash buckets. */
 	IB_FMR_HASH_BITS  = 8,
 	/* Number of hlist heads allocated for pool->cache_bucket. */
 	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,
 	/* Bit mask covering every valid bucket index. */
 	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1
 };
 
 /*
  * If an FMR is not in use, then the list member will point to either
  * its pool's free_list (if the FMR can be mapped again; that is,
  * remap_count < pool->max_remaps) or its pool's dirty_list (if the
  * FMR needs to be unmapped before being remapped).  In either of
  * these cases it is a bug if the ref_count is not 0.  In other words,
  * if ref_count is > 0, then the list member must not be linked into
  * either free_list or dirty_list.
  *
  * The cache_node member is used to link the FMR into a cache bucket
  * (if caching is enabled).  This is independent of the reference
  * count of the FMR.  When a valid FMR is released, its ref_count is
  * decremented, and if ref_count reaches 0, the FMR is placed in
  * either free_list or dirty_list as appropriate.  However, it is not
  * removed from the cache and may be "revived" if a call to
  * ib_fmr_register_physical() occurs before the FMR is remapped.  In
  * this case we just increment the ref_count and remove the FMR from
  * free_list/dirty_list.
  *
  * Before we remap an FMR from free_list, we remove it from the cache
  * (to prevent another user from obtaining a stale FMR).  When an FMR
  * is released, we add it to the tail of the free list, so that our
  * cache eviction policy is "least recently used."
  *
  * All manipulation of ref_count, list and cache_node is protected by
  * pool_lock to maintain consistency.
  */
 
 /*
  * State of one FMR pool, created by ib_create_fmr_pool() and torn down by
  * ib_destroy_fmr_pool().
  */
 struct ib_fmr_pool {
 	/* Protects ref_count, list and cache_node of every FMR in the pool. */
 	spinlock_t                pool_lock;
 
 	/* Number of FMRs successfully allocated into the pool. */
 	int                       pool_size;
 	/* Largest page list accepted by ib_fmr_pool_map_phys(). */
 	int                       max_pages;
 	/* Remap count at which an unmapped FMR goes to dirty_list. */
 	int			  max_remaps;
 	/* dirty_len value at which the cleanup thread is woken. */
 	int                       dirty_watermark;
 	/* Number of FMRs added to dirty_list since the last batch release. */
 	int                       dirty_len;
 	/* Unreferenced FMRs that may be mapped again. */
 	struct list_head          free_list;
 	/* Unreferenced FMRs that must be unmapped before reuse. */
 	struct list_head          dirty_list;
 	/* Array of IB_FMR_HASH_SIZE buckets, or NULL when caching is off. */
 	struct hlist_head        *cache_bucket;
 
 	/* Optional callback run by the cleanup thread after each flush. */
 	void                     (*flush_function)(struct ib_fmr_pool *pool,
 						   void *              arg);
 	/* Opaque argument handed to flush_function. */
 	void                     *flush_arg;
 
 	/* Cleanup kthread running ib_fmr_cleanup_thread(). */
 	struct task_struct       *thread;
 
 	/* Serial number of the most recently requested flush. */
 	atomic_t                  req_ser;
 	/* Serial number of the most recently completed flush. */
 	atomic_t                  flush_ser;
 
 	/* ib_flush_fmr_pool() callers sleep here until flush_ser catches up. */
 	wait_queue_head_t         force_wait;
 };
 
 /*
  * Map the first page address of a page list to a cache bucket index by
  * hashing its low and high 32-bit halves and masking to the table size.
  */
 static inline u32 ib_fmr_hash(u64 first_page)
 {
 	u32 lo = (u32) first_page;
 	u32 hi = (u32) (first_page >> 32);
 
 	return jhash_2words(lo, hi, 0) & (IB_FMR_HASH_SIZE - 1);
 }
 
 /*
  * Look in the pool's cache for an FMR whose I/O virtual address, page
  * count and page list all match the request.  Returns the matching FMR,
  * or NULL when the pool has no cache or nothing matches.
  *
  * Caller must hold pool_lock.
  */
 static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
 						      u64 *page_list,
 						      int  page_list_len,
 						      u64  io_virtual_address)
 {
 	struct hlist_head *head;
 	struct ib_pool_fmr *cand;
 
 	if (pool->cache_bucket == NULL)
 		return NULL;
 
 	head = &pool->cache_bucket[ib_fmr_hash(page_list[0])];
 
 	hlist_for_each_entry(cand, head, cache_node) {
 		if (cand->io_virtual_address != io_virtual_address)
 			continue;
 		if (cand->page_list_len != page_list_len)
 			continue;
 		if (memcmp(page_list, cand->page_list,
 			   page_list_len * sizeof *page_list) == 0)
 			return cand;
 	}
 
 	return NULL;
 }
 
 /*
  * Unmap every FMR currently on the pool's dirty list and move them all to
  * the free list.  Does nothing beyond resetting dirty_len when the dirty
  * list is empty.
  */
 static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 {
 	int                 ret;
 	struct ib_pool_fmr *fmr;
 	/* Pool-level entries taken off dirty_list. */
 	LIST_HEAD(unmap_list);
 	/* The underlying ib_fmr objects, in the form ib_unmap_fmr() takes. */
 	LIST_HEAD(fmr_list);
 
 	spin_lock_irq(&pool->pool_lock);
 
 	list_for_each_entry(fmr, &pool->dirty_list, list) {
 		/* Drop the cache entry and restart the remap budget. */
 		hlist_del_init(&fmr->cache_node);
 		fmr->remap_count = 0;
 		list_add_tail(&fmr->fmr->list, &fmr_list);
 
 #ifdef DEBUG
 		if (fmr->ref_count !=0) {
 			printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
 			       fmr, fmr->ref_count);
 		}
 #endif
 	}
 
 	/* Detach the whole dirty list so the unmap can run without the lock. */
 	list_splice_init(&pool->dirty_list, &unmap_list);
 	pool->dirty_len = 0;
 
 	spin_unlock_irq(&pool->pool_lock);
 
 	if (list_empty(&unmap_list)) {
 		return;
 	}
 
 	/* A failure is only logged; the FMRs are returned to free_list anyway. */
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
 		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
 
 	spin_lock_irq(&pool->pool_lock);
 	list_splice(&unmap_list, &pool->free_list);
 	spin_unlock_irq(&pool->pool_lock);
 }
 
 /*
  * Body of the per-pool cleanup kthread.  Whenever a flush has been
  * requested (req_ser is ahead of flush_ser) it releases the dirty list,
  * advances flush_ser, wakes waiters on force_wait and calls the pool's
  * optional flush_function.  Otherwise it sleeps until woken.  Returns 0
  * once kthread_should_stop() reports true.
  */
 static int ib_fmr_cleanup_thread(void *pool_ptr)
 {
 	struct ib_fmr_pool *pool = pool_ptr;
 
 	do {
 		/* Serials are compared by subtraction rather than directly. */
 		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
 			ib_fmr_batch_release(pool);
 
 			atomic_inc(&pool->flush_ser);
 			wake_up_interruptible(&pool->force_wait);
 
 			if (pool->flush_function)
 				pool->flush_function(pool, pool->flush_arg);
 		}
 
 		/*
 		 * The task state is changed before the condition is re-checked;
 		 * presumably so that a wake_up_process() arriving between the
 		 * check and schedule() is not lost.
 		 */
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
 		    !kthread_should_stop())
 			schedule();
 		__set_current_state(TASK_RUNNING);
 	} while (!kthread_should_stop());
 
 	return 0;
 }
 
 /**
  * ib_create_fmr_pool - Create an FMR pool
  * @pd: Protection domain for FMRs
  * @params: FMR pool parameters
  *
  * Create a pool of FMRs.  Return value is pointer to new pool or
  * an ERR_PTR-encoded error code if creation failed: -EINVAL when @params
  * is NULL, -ENOSYS when the device lacks any of the FMR methods, the
  * ib_query_device() or kthread_run() error, or -ENOMEM otherwise.
  */
 struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 				       struct ib_fmr_pool_param *params)
 {
 	struct ib_device   *device;
 	struct ib_fmr_pool *pool;
 	struct ib_device_attr *attr;
 	int i;
 	int ret;
 	int max_remaps;
 
 	if (!params)
 		return ERR_PTR(-EINVAL);
 
 	/* All four FMR methods must be provided by the device. */
 	device = pd->device;
 	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
 	    !device->map_phys_fmr || !device->unmap_fmr) {
 		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
 		       device->name);
 		return ERR_PTR(-ENOSYS);
 	}
 
 	/* attr is only needed long enough to read max_map_per_fmr. */
 	attr = kmalloc(sizeof *attr, GFP_KERNEL);
 	if (!attr) {
 		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
 	ret = ib_query_device(device, attr);
 	if (ret) {
 		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
 		kfree(attr);
 		return ERR_PTR(ret);
 	}
 
 	/* Fall back to the built-in limit when the device reports zero. */
 	if (!attr->max_map_per_fmr)
 		max_remaps = IB_FMR_MAX_REMAPS;
 	else
 		max_remaps = attr->max_map_per_fmr;
 
 	kfree(attr);
 
 	pool = kmalloc(sizeof *pool, GFP_KERNEL);
 	if (!pool) {
 		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
 	/* Set early so out_free_pool can kfree() it unconditionally. */
 	pool->cache_bucket   = NULL;
 
 	pool->flush_function = params->flush_function;
 	pool->flush_arg      = params->flush_arg;
 
 	INIT_LIST_HEAD(&pool->free_list);
 	INIT_LIST_HEAD(&pool->dirty_list);
 
 	/* The hash table exists only when the caller asked for caching. */
 	if (params->cache) {
 		pool->cache_bucket =
 			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
 				GFP_KERNEL);
 		if (!pool->cache_bucket) {
 			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
 			ret = -ENOMEM;
 			goto out_free_pool;
 		}
 
 		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
 			INIT_HLIST_HEAD(pool->cache_bucket + i);
 	}
 
 	/* pool_size counts FMRs actually created; it grows in the loop below. */
 	pool->pool_size       = 0;
 	pool->max_pages       = params->max_pages_per_fmr;
 	pool->max_remaps      = max_remaps;
 	pool->dirty_watermark = params->dirty_watermark;
 	pool->dirty_len       = 0;
 	spin_lock_init(&pool->pool_lock);
 	atomic_set(&pool->req_ser,   0);
 	atomic_set(&pool->flush_ser, 0);
 	init_waitqueue_head(&pool->force_wait);
 
 	pool->thread = kthread_run(ib_fmr_cleanup_thread,
 				   pool,
 				   "ib_fmr(%s)",
 				   device->name);
 	if (IS_ERR(pool->thread)) {
 		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
 		ret = PTR_ERR(pool->thread);
 		goto out_free_pool;
 	}
 
 	{
 		struct ib_pool_fmr *fmr;
 		struct ib_fmr_attr fmr_attr = {
 			.max_pages  = params->max_pages_per_fmr,
 			.max_maps   = pool->max_remaps,
 			.page_shift = params->page_shift
 		};
 		int bytes_per_fmr = sizeof *fmr;
 
 		/* With caching, each FMR carries room for a copy of its page list. */
 		if (pool->cache_bucket)
 			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
 
 		for (i = 0; i < params->pool_size; ++i) {
 			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
 			if (!fmr) {
 				printk(KERN_WARNING PFX "failed to allocate fmr "
 				       "struct for FMR %d\n", i);
 				goto out_fail;
 			}
 
 			fmr->pool             = pool;
 			fmr->remap_count      = 0;
 			fmr->ref_count        = 0;
 			INIT_HLIST_NODE(&fmr->cache_node);
 
 			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
 			if (IS_ERR(fmr->fmr)) {
 				printk(KERN_WARNING PFX "fmr_create failed "
 				       "for FMR %d\n", i);
 				kfree(fmr);
 				goto out_fail;
 			}
 
 			list_add_tail(&fmr->list, &pool->free_list);
 			++pool->pool_size;
 		}
 	}
 
 	return pool;
 
 	/* Reached before any FMR exists: only the pool itself needs freeing. */
  out_free_pool:
 	kfree(pool->cache_bucket);
 	kfree(pool);
 
 	return ERR_PTR(ret);
 
 	/*
 	 * Reached once the thread is running: the regular destroy path stops
 	 * it and frees the FMRs created so far.  The error reported is always
 	 * -ENOMEM, even when ib_alloc_fmr() failed with a different code.
 	 */
  out_fail:
 	ib_destroy_fmr_pool(pool);
 
 	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(ib_create_fmr_pool);
 
 /**
  * ib_destroy_fmr_pool - Free FMR pool
  * @pool: FMR pool to free
  *
  * Destroy an FMR pool and free all associated resources.  FMRs that are
  * not on the free list at this point are not released; a warning with
  * their count is logged instead.
  */
 void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
 {
 	struct ib_pool_fmr *fmr;
 	struct ib_pool_fmr *tmp;
 	LIST_HEAD(fmr_list);
 	int                 i;
 
 	/* Stop the cleanup thread first, then flush the dirty list ourselves. */
 	kthread_stop(pool->thread);
 	ib_fmr_batch_release(pool);
 
 	/* i counts the FMRs found on the free list and released. */
 	i = 0;
 	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
 		/* An FMR with a nonzero remap count is unmapped on its own first. */
 		if (fmr->remap_count) {
 			INIT_LIST_HEAD(&fmr_list);
 			list_add_tail(&fmr->fmr->list, &fmr_list);
 			ib_unmap_fmr(&fmr_list);
 		}
 		ib_dealloc_fmr(fmr->fmr);
 		list_del(&fmr->list);
 		kfree(fmr);
 		++i;
 	}
 
 	if (i < pool->pool_size)
 		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
 		       pool->pool_size - i);
 
 	kfree(pool->cache_bucket);
 	kfree(pool);
 }
 EXPORT_SYMBOL(ib_destroy_fmr_pool);
 
 /**
  * ib_flush_fmr_pool - Invalidate all unmapped FMRs
  * @pool: FMR pool to flush
  *
  * Queue every previously mapped FMR on the free list for unmapping, ask
  * the cleanup thread for a flush and wait until that flush has run.
  * Returns 0 on success or -EINTR if the wait was interrupted.
  */
 int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
 {
 	struct ib_pool_fmr *cur, *tmp;
 	int target;
 
 	/*
 	 * FMRs on free_list may have been mapped without yet reaching the
 	 * remap limit.  Move those onto dirty_list so that the cleanup
 	 * thread unmaps them as part of this flush.
 	 */
 	spin_lock_irq(&pool->pool_lock);
 	list_for_each_entry_safe(cur, tmp, &pool->free_list, list) {
 		if (cur->remap_count <= 0)
 			continue;
 		list_move(&cur->list, &pool->dirty_list);
 	}
 	spin_unlock_irq(&pool->pool_lock);
 
 	/* Request a flush and remember which serial number satisfies it. */
 	target = atomic_inc_return(&pool->req_ser);
 	wake_up_process(pool->thread);
 
 	if (wait_event_interruptible(pool->force_wait,
 				     atomic_read(&pool->flush_ser) - target >= 0) != 0)
 		return -EINTR;
 
 	return 0;
 }
 EXPORT_SYMBOL(ib_flush_fmr_pool);
 
 /**
  * ib_fmr_pool_map_phys - Map an FMR from an FMR pool
  * @pool_handle: FMR pool to allocate FMR from
  * @page_list: List of pages to map
  * @list_len: Number of pages in @page_list
  * @io_virtual_address: I/O virtual address for new FMR
  *
  * Map an FMR from an FMR pool.  Returns the FMR on success, or an
  * ERR_PTR: -EINVAL when @list_len is out of range, -EAGAIN when the free
  * list is empty, or the error reported by ib_map_phys_fmr().
  */
 struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
 					 u64                *page_list,
 					 int                 list_len,
 					 u64                 io_virtual_address)
 {
 	struct ib_fmr_pool *pool = pool_handle;
 	struct ib_pool_fmr *fmr;
 	unsigned long       flags;
 	int                 result;
 
 	if (list_len < 1 || list_len > pool->max_pages)
 		return ERR_PTR(-EINVAL);
 
 	spin_lock_irqsave(&pool->pool_lock, flags);
 	fmr = ib_fmr_cache_lookup(pool,
 				  page_list,
 				  list_len,
 				  io_virtual_address);
 	if (fmr) {
 		/* found in cache */
 		++fmr->ref_count;
 		/* First reference: the FMR was idle on a list, so unlink it. */
 		if (fmr->ref_count == 1) {
 			list_del(&fmr->list);
 		}
 
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
 
 		return fmr;
 	}
 
 	if (list_empty(&pool->free_list)) {
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
 		return ERR_PTR(-EAGAIN);
 	}
 
 	/* Take the head of free_list and drop any stale cache entry for it. */
 	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
 	list_del(&fmr->list);
 	hlist_del_init(&fmr->cache_node);
 	spin_unlock_irqrestore(&pool->pool_lock, flags);
 
 	/* The mapping itself is performed without holding pool_lock. */
 	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
 				 io_virtual_address);
 
 	if (result) {
 		/* Mapping failed: put the FMR back at the head of free_list. */
 		spin_lock_irqsave(&pool->pool_lock, flags);
 		list_add(&fmr->list, &pool->free_list);
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
 
 		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
 
 		return ERR_PTR(result);
 	}
 
 	++fmr->remap_count;
 	fmr->ref_count = 1;
 
 	/* Record the mapping so an identical request can reuse this FMR. */
 	if (pool->cache_bucket) {
 		fmr->io_virtual_address = io_virtual_address;
 		fmr->page_list_len      = list_len;
 		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
 
 		spin_lock_irqsave(&pool->pool_lock, flags);
 		hlist_add_head(&fmr->cache_node,
 			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
 		spin_unlock_irqrestore(&pool->pool_lock, flags);
 	}
 
 	return fmr;
 }
 EXPORT_SYMBOL(ib_fmr_pool_map_phys);
 
 /**
  * ib_fmr_pool_unmap - Unmap FMR
  * @fmr: FMR to unmap
  *
  * Drop one reference on @fmr.  When the last reference goes away the FMR
  * is queued on the pool's free list, or on the dirty list once it has
  * used up its remap budget; reaching the dirty watermark requests a flush
  * from the cleanup thread.  The mapping itself may stay valid until the
  * FMR is reused or ib_flush_fmr_pool() is called.  Always returns 0.
  */
 int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 {
 	struct ib_fmr_pool *pool = fmr->pool;
 	unsigned long flags;
 
 	spin_lock_irqsave(&pool->pool_lock, flags);
 
 	if (--fmr->ref_count == 0) {
 		if (fmr->remap_count >= pool->max_remaps) {
 			list_add_tail(&fmr->list, &pool->dirty_list);
 			pool->dirty_len++;
 			if (pool->dirty_len >= pool->dirty_watermark) {
 				atomic_inc(&pool->req_ser);
 				wake_up_process(pool->thread);
 			}
 		} else {
 			list_add_tail(&fmr->list, &pool->free_list);
 		}
 	}
 
 #ifdef DEBUG
 	if (fmr->ref_count < 0)
 		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
 		       fmr, fmr->ref_count);
 #endif
 
 	spin_unlock_irqrestore(&pool->pool_lock, flags);
 
 	return 0;
 }
 EXPORT_SYMBOL(ib_fmr_pool_unmap);
Index: head/sys/ofed/drivers/infiniband/core/iwcm.c
===================================================================
--- head/sys/ofed/drivers/infiniband/core/iwcm.c	(revision 320071)
+++ head/sys/ofed/drivers/infiniband/core/iwcm.c	(revision 320072)
@@ -1,1285 +1,1286 @@
 /*
  * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
  * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
  * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
  * Copyright (c) 2016 Chelsio Communications.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  */
 #include "opt_inet.h"
 
 #include <linux/dma-mapping.h>
 #include <linux/err.h>
 #include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/rbtree.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/wait.h>
 #include <netinet/tcp.h>
 #include <sys/mutex.h>
 
 #include <rdma/rdma_cm.h>
 #include <rdma/iw_cm.h>
 #include <rdma/ib_addr.h>
 
 #include "iwcm.h"
 
 MODULE_AUTHOR("Tom Tucker");
 MODULE_DESCRIPTION("iWARP CM");
 MODULE_LICENSE("Dual BSD/GPL");
 
 /* Workqueue on which iw_so_upcall() queues listen work. */
 static struct workqueue_struct *iwcm_wq;
 
 /* One preallocated event work element belonging to a cm_id. */
 struct iwcm_work {
 	struct work_struct work;
 	/* Owning cm_id; put_work() returns the element to its free list. */
 	struct iwcm_id_private *cm_id;
 	/* NOTE(review): presumably links into cm_id->work_list -- confirm. */
 	struct list_head list;
 	struct iw_cm_event event;
 	/* Linkage on the owning cm_id's work_free_list. */
 	struct list_head free_list;
 };
 
 /* Deferred work created per socket upcall; see iw_so_event_handler(). */
 struct iwcm_listen_work {
 	struct work_struct work;
 	/* The listening cm_id that was registered with the socket upcall. */
 	struct iw_cm_id *cm_id;
 };
 
 /* List of listen_port_info entries, one per port with listeners. */
 static LIST_HEAD(listen_port_list);
 
 /* Serializes every access to listen_port_list and its entries. */
 static DEFINE_MUTEX(listen_port_mutex);
 
 /* Reference-counted record of a port that has at least one listener. */
 struct listen_port_info {
 	struct list_head list;
 	uint16_t port_num;
 	/* Number of add_port_to_listenlist() calls not yet undone. */
 	uint32_t refcnt;
 };
 
 /*
  * Take a reference on the listen-port entry for `port`, creating the
  * entry when this is the first listener on that port.
  *
  * Returns the entry's reference count after the increment (1 for the
  * first listener), or -ENOMEM if a new entry could not be allocated.
  */
 static int32_t
 add_port_to_listenlist(uint16_t port)
 {
 	struct listen_port_info *port_info;
 	int32_t ret;
 
 	mutex_lock(&listen_port_mutex);
 
 	list_for_each_entry(port_info, &listen_port_list, list)
 		if (port_info->port_num == port)
 			goto found_port;
 
 	port_info = kmalloc(sizeof(*port_info), GFP_KERNEL);
 	if (!port_info) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	port_info->port_num = port;
 	port_info->refcnt    = 0;
 
 	list_add(&port_info->list, &listen_port_list);
 
 found_port:
 	/*
 	 * Snapshot the count while the mutex is still held.  Once the lock
 	 * is dropped a concurrent rem_port_from_listenlist() may free
 	 * port_info, so it must not be dereferenced afterwards.
 	 */
 	ret = ++(port_info->refcnt);
 out:
 	mutex_unlock(&listen_port_mutex);
 	return ret;
 }
 
 /*
  * Drop one reference on the listen-port entry for `port` and free the
  * entry when no listeners remain.
  *
  * Returns the remaining reference count, or -EINVAL when the port has no
  * entry on the list.
  */
 static int32_t
 rem_port_from_listenlist(uint16_t port)
 {
 	struct listen_port_info *entry;
 	struct listen_port_info *match = NULL;
 	int ret;
 
 	mutex_lock(&listen_port_mutex);
 
 	list_for_each_entry(entry, &listen_port_list, list) {
 		if (entry->port_num == port) {
 			match = entry;
 			break;
 		}
 	}
 
 	if (match == NULL) {
 		ret = -EINVAL;
 	} else {
 		ret = --match->refcnt;
 		if (match->refcnt == 0) {
 			/* Last listener on this port_num is gone. */
 			list_del(&match->list);
 			kfree(match);
 		}
 	}
 
 	mutex_unlock(&listen_port_mutex);
 	return ret;
 }
 
 /*
  * The following services provide a mechanism for pre-allocating iwcm_work
  * elements.  The design pre-allocates them  based on the cm_id type:
  *	LISTENING IDS: 	Get enough elements preallocated to handle the
  *			listen backlog.
  *	ACTIVE IDS:	4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
  *	PASSIVE IDS:	3: ESTABLISHED, DISCONNECT, CLOSE
  *
  * Allocating them in connect and listen avoids having to deal
  * with allocation failures on the event upcall from the provider (which
  * is called in the interrupt context).
  *
  * One exception is when creating the cm_id for incoming connection requests.
  * There are two cases:
  * 1) in the event upcall, cm_event_handler(), for a listening cm_id.  If
  *    the backlog is exceeded, then no more connection request events will
  *    be processed.  cm_event_handler() returns -ENOMEM in this case.  It's up
  *    to the provider to reject the connection request.
  * 2) in the connection request workqueue handler, cm_conn_req_handler().
  *    If work elements cannot be allocated for the new connect request cm_id,
  *    then IWCM will call the provider reject method.  This is ok since
  *    cm_conn_req_handler() runs in the workqueue thread context.
  */
 
 /*
  * Pop one preallocated work element off the cm_id's free list.
  * Returns NULL when the free list is empty.
  */
 static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
 {
 	struct list_head *first;
 
 	if (list_empty(&cm_id_priv->work_free_list))
 		return NULL;
 
 	first = cm_id_priv->work_free_list.next;
 	list_del_init(first);
 	return list_entry(first, struct iwcm_work, free_list);
 }
 
 static void put_work(struct iwcm_work *work)
 {
 	list_add(&work->free_list, &work->cm_id->work_free_list);
 }
 
 static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
 {
 	struct list_head *e, *tmp;
 
 	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
 		kfree(list_entry(e, struct iwcm_work, free_list));
 }
 
 static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
 {
 	struct iwcm_work *work;
 
 	BUG_ON(!list_empty(&cm_id_priv->work_free_list));
 	while (count--) {
 		work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
 		if (!work) {
 			dealloc_work_entries(cm_id_priv);
 			return -ENOMEM;
 		}
 		work->cm_id = cm_id_priv;
 		INIT_LIST_HEAD(&work->list);
 		put_work(work);
 	}
 	return 0;
 }
 
 /*
  * Save private data from incoming connection requests to
  * iw_cm_event, so the low level driver doesn't have to. Adjust
  * the event ptr to point to the local copy.
  */
 static int copy_private_data(struct iw_cm_event *event)
 {
 	void *p;
 
 	p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
 	if (!p)
 		return -ENOMEM;
 	event->private_data = p;
 	return 0;
 }
 
 /* Release the cm_id's unused work elements, then the cm_id itself. */
 static void
 free_cm_id(struct iwcm_id_private *cm_id_priv)
 {
 	dealloc_work_entries(cm_id_priv);
 
 	kfree(cm_id_priv);
 }
 
 /*
  * Drop one reference on the cm_id.  When that was the last reference,
  * signal destroy_comp (which iw_destroy_cm_id waits on) and return 1;
  * otherwise return 0.
  */
 static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
 {
 	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
 
 	if (!atomic_dec_and_test(&cm_id_priv->refcount))
 		return 0;
 
 	BUG_ON(!list_empty(&cm_id_priv->work_list));
 	complete(&cm_id_priv->destroy_comp);
 	return 1;
 }
 
 /* add_ref method installed on every iw_cm_id: take one more reference. */
 static void add_ref(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *priv =
 	    container_of(cm_id, struct iwcm_id_private, id);
 
 	atomic_inc(&priv->refcount);
 }
 
 /*
  * rem_ref method installed on every iw_cm_id: drop one reference, and
  * free the cm_id here when that was the last reference and
  * IWCM_F_CALLBACK_DESTROY is set.
  */
 static void rem_ref(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv =
 	    container_of(cm_id, struct iwcm_id_private, id);
 	int destroy_from_cb;
 
 	/*
 	 * Sample the flag before dropping the reference: afterwards the
 	 * cm_id may already have been freed on another thread.
 	 */
 	destroy_from_cb = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
 
 	if (!iwcm_deref_id(cm_id_priv))
 		return;
 	if (!destroy_from_cb)
 		return;
 
 	BUG_ON(!list_empty(&cm_id_priv->work_list));
 	free_cm_id(cm_id_priv);
 }
 
 static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
 
 /*
  * Allocate a cm_id in the IDLE state holding one reference, bound to the
  * given device, socket, client handler and context.  Returns the public
  * iw_cm_id, or ERR_PTR(-ENOMEM) when the allocation fails.
  */
 struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
 				 struct socket *so,
 				 iw_cm_handler cm_handler,
 				 void *context)
 {
 	struct iwcm_id_private *priv;
 	struct iw_cm_id *id;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (priv == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	/* Private bookkeeping. */
 	priv->state = IW_CM_STATE_IDLE;
 	spin_lock_init(&priv->lock);
 	atomic_set(&priv->refcount, 1);
 	init_waitqueue_head(&priv->connect_wait);
 	init_completion(&priv->destroy_comp);
 	INIT_LIST_HEAD(&priv->work_list);
 	INIT_LIST_HEAD(&priv->work_free_list);
 
 	/* Public part handed back to the caller. */
 	id = &priv->id;
 	id->device = device;
 	id->cm_handler = cm_handler;
 	id->context = context;
 	id->event_handler = cm_event_handler;
 	id->add_ref = add_ref;
 	id->rem_ref = rem_ref;
 	id->so = so;
 
 	return id;
 }
 EXPORT_SYMBOL(iw_create_cm_id);
 
 
 static int iwcm_modify_qp_err(struct ib_qp *qp)
 {
 	struct ib_qp_attr qp_attr;
 
 	if (!qp)
 		return -EINVAL;
 
 	qp_attr.qp_state = IB_QPS_ERR;
 	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
 }
 
 /*
  * This is really the RDMAC CLOSING state. It is most similar to the
  * IB SQD QP state.
  */
 static int iwcm_modify_qp_sqd(struct ib_qp *qp)
 {
 	struct ib_qp_attr qp_attr;
 
 	BUG_ON(qp == NULL);
 	qp_attr.qp_state = IB_QPS_SQD;
 	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
 }
 
 /*
  * CM_ID <-- CLOSING
  *
  * Block if a passive or active connection is currently being processed. Then
  * process the event as follows:
  * - If we are ESTABLISHED, move to CLOSING and modify the QP state
  *   based on the abrupt flag
  * - If the connection is already in the CLOSING or IDLE state, the peer is
  *   disconnecting concurrently with us and we've already seen the
  *   DISCONNECT event -- ignore the request and return 0
  * - Disconnect on a listening endpoint returns -EINVAL
  *
  * Also returns -EINVAL when the cm_id is ESTABLISHED but has no QP.  Any
  * state not listed in the switch below, and CONN_SENT, hits BUG().
  */
 int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
 {
 	struct iwcm_id_private *cm_id_priv;
 	unsigned long flags;
 	int ret = 0;
 	struct ib_qp *qp = NULL;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 	/* Wait if we're currently in a connect or accept downcall */
 	wait_event(cm_id_priv->connect_wait,
 		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_ESTABLISHED:
 		cm_id_priv->state = IW_CM_STATE_CLOSING;
 
 		/* QP could be NULL for user-mode client */
 		if (cm_id_priv->qp)
 			qp = cm_id_priv->qp;
 		else
 			ret = -EINVAL;
 		break;
 	case IW_CM_STATE_LISTEN:
 		ret = -EINVAL;
 		break;
 	case IW_CM_STATE_CLOSING:
 		/* remote peer closed first */
 	case IW_CM_STATE_IDLE:
 		/* accept or connect returned !0 */
 		break;
 	case IW_CM_STATE_CONN_RECV:
 		/*
 		 * App called disconnect before/without calling accept after
 		 * connect_request event delivered.
 		 */
 		break;
 	case IW_CM_STATE_CONN_SENT:
 		/* Can only get here if wait above fails */
 	default:
 		BUG();
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	/* The QP is modified only after the spinlock has been released. */
 	if (qp) {
 		if (abrupt)
 			ret = iwcm_modify_qp_err(qp);
 		else
 			ret = iwcm_modify_qp_sqd(qp);
 
 		/*
 		 * If both sides are disconnecting the QP could
 		 * already be in ERR or SQD states, so the result of
 		 * the modify call is deliberately discarded.
 		 */
 		ret = 0;
 	}
 
 	return ret;
 }
 EXPORT_SYMBOL(iw_cm_disconnect);
 
 /*
  * Take one completed connection off the listening socket `head` without
  * blocking and accept it.  Returns the new socket, or NULL when
  * solisten_dequeue() reports EWOULDBLOCK.
  *
  * NOTE(review): SOLISTEN_LOCK(head) is taken here and never released in
  * this function -- presumably solisten_dequeue() drops it on every path;
  * confirm against its contract.
  * NOTE(review): only EWOULDBLOCK is checked.  If solisten_dequeue() can
  * fail with another error without setting `so`, the code below uses an
  * uninitialized pointer -- verify.
  */
 static struct socket *
 dequeue_socket(struct socket *head)
 {
 	struct socket *so;
 	struct sockaddr_in *remote;
 	int error;
 
 	SOLISTEN_LOCK(head);
 	error = solisten_dequeue(head, &so, SOCK_NONBLOCK);
 	if (error == EWOULDBLOCK)
 		return (NULL);
 	remote = NULL;
 	/* The result of soaccept() is not checked. */
 	soaccept(so, (struct sockaddr **)&remote);
 
 	/* The peer address is not needed by the caller; release it at once. */
 	free(remote, M_SONAME);
 	return so;
 }
 
 /*
  * Workqueue handler for work queued by iw_so_upcall().  Drains every
  * pending connection from the listening socket and hands each new socket
  * to the provider's newconn method.  The work item is freed on every
  * path.  When INET is not defined the body is empty.
  */
 static void
 iw_so_event_handler(struct work_struct *_work)
 {
 #ifdef INET
 	struct	iwcm_listen_work *work = container_of(_work,
 						struct iwcm_listen_work, work);
 	struct	iw_cm_id *listen_cm_id = work->cm_id;
 	struct	iwcm_id_private *cm_id_priv;
 	struct	iw_cm_id *real_cm_id;
 	struct	sockaddr_in *local;
 	struct	socket *so;
 
 	cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id);
 
 	/* Nothing to do unless the cm_id is still listening. */
 	if (cm_id_priv->state != IW_CM_STATE_LISTEN) {
 		kfree(work);
 		return;
 	}
 
 	/* Dequeue & process  all new 'so' connection requests for this cmid */
 	while ((so = dequeue_socket(work->cm_id->so)) != NULL) {
 		/*
 		 * For a wildcard listener, look up the cm_id matching the
 		 * local address the connection actually arrived on.
 		 */
 		if (rdma_cma_any_addr((struct sockaddr *)
 					&listen_cm_id->local_addr)) {
 			in_getsockaddr(so, (struct sockaddr **)&local);
 			/*
 			 * NOTE(review): on lookup failure the loop is abandoned
 			 * and `so` is not passed to newconn or released here --
 			 * confirm who owns it.
 			 */
 			if (rdma_find_cmid_laddr(local, ARPHRD_ETHER,
 					(void **) &real_cm_id)) {
 				free(local, M_SONAME);
 				goto err;
 			}
 			free(local, M_SONAME);
 
 			real_cm_id->device->iwcm->newconn(real_cm_id, so);
 		} else {
 			listen_cm_id->device->iwcm->newconn(listen_cm_id, so);
 		}
 	}
 err:
 	kfree(work);
 #endif
 	return;
 }
 
 static int
 iw_so_upcall(struct socket *parent_so, void *arg, int waitflag)
 {
 	struct iwcm_listen_work *work;
 	struct iw_cm_id *cm_id = arg;
 
 	/* check whether iw_so_event_handler() already dequeued this 'so' */
 	if (TAILQ_EMPTY(&parent_so->sol_comp))
 		return SU_OK;
 	work = kzalloc(sizeof(*work), waitflag);
 	if (!work)
 		return -ENOMEM;
 	work->cm_id = cm_id;
 
 	INIT_WORK(&work->work, iw_so_event_handler);
 	queue_work(iwcm_wq, &work->work);
 
 	return SU_OK;
 }
 
 static int
 iw_create_listen(struct iw_cm_id *cm_id, int backlog)
 {
 	struct sockopt sopt;
 	struct socket *so = cm_id->so;
 	int on = 1;
 	int rc;
 
 	rc = -solisten(cm_id->so, backlog, curthread);
 	if (rc != 0)
 		return (rc);
 	SOLISTEN_LOCK(so);
 	solisten_upcall_set(so, iw_so_upcall, cm_id);
 	so->so_state |= SS_NBIO;
 	SOLISTEN_UNLOCK(so);
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = IPPROTO_TCP;
 	sopt.sopt_name = TCP_NODELAY;
 	sopt.sopt_val = (caddr_t)&on;
 	sopt.sopt_valsize = sizeof(on);
 	sopt.sopt_td = NULL;
 	sosetopt(so, &sopt);
 	return (0);
 }
 
 /*
  * Remove the listen upcall from the cm_id's socket under the listen lock.
  * Always returns 0.
  */
 static int
 iw_destroy_listen(struct iw_cm_id *cm_id)
 {
 	struct socket *lso = cm_id->so;
 
 	SOLISTEN_LOCK(lso);
 	solisten_upcall_set(lso, NULL, NULL);
 	SOLISTEN_UNLOCK(lso);
 
 	return (0);
 }
 
 
 /*
  * CM_ID <-- DESTROYING
  *
  * Clean up all resources associated with the connection and release
  * the initial reference taken by iw_create_cm_id.
  *
  * The cm_id lock is dropped around every call into the provider and
  * re-taken afterwards, so each case leaves the switch with the lock held.
  */
 static void destroy_cm_id(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
 	unsigned long flags;
 	/* ret collects iw_destroy_listen() results but is never reported. */
 	int ret = 0, refcnt;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 	/*
 	 * Wait if we're currently in a connect or accept downcall. A
 	 * listening endpoint should never block here.
 	 */
 	wait_event(cm_id_priv->connect_wait,
 		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_LISTEN:
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		/*
 		 * A wildcard listener's socket upcall is removed only when
 		 * the last reference on its port goes away.
 		 */
 		if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
 			refcnt =
 			  rem_port_from_listenlist(cm_id->local_addr.sin_port);
 
 			if (refcnt == 0)
 				ret = iw_destroy_listen(cm_id);
 
 			cm_id->device->iwcm->destroy_listen_ep(cm_id);
 		} else {
 			ret = iw_destroy_listen(cm_id);
 			cm_id->device->iwcm->destroy_listen_ep(cm_id);
 		}
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_ESTABLISHED:
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		/* Abrupt close of the connection */
 		(void)iwcm_modify_qp_err(cm_id_priv->qp);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_IDLE:
 	case IW_CM_STATE_CLOSING:
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		break;
 	case IW_CM_STATE_CONN_RECV:
 		/*
 		 * App called destroy before/without calling accept after
 		 * receiving connection request event notification or
 		 * returned non zero from the event callback function.
 		 * In either case, must tell the provider to reject.
 		 */
 		cm_id_priv->state = IW_CM_STATE_DESTROYING;
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		cm_id->device->iwcm->reject(cm_id, NULL, 0);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_CONN_SENT:
 	case IW_CM_STATE_DESTROYING:
 	default:
 		BUG();
 		break;
 	}
 	/* Drop the provider's reference on the QP, if one is attached. */
 	if (cm_id_priv->qp) {
 		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
 		cm_id_priv->qp = NULL;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	(void)iwcm_deref_id(cm_id_priv);
 }
 
 /*
  * This function is only called by the application thread and cannot
  * be called by the event thread. The function will wait for all
  * references to be released on the cm_id and then kfree the cm_id
  * object.
  */
 void iw_destroy_cm_id(struct iw_cm_id *cm_id)
 {
 	struct iwcm_id_private *cm_id_priv;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
 
 	destroy_cm_id(cm_id);
 
 	wait_for_completion(&cm_id_priv->destroy_comp);
 
 	if (cm_id->so)
 		sock_release(cm_id->so);
 
 	free_cm_id(cm_id_priv);
 }
 EXPORT_SYMBOL(iw_destroy_cm_id);
 
 /*
  * CM_ID <-- LISTEN
  *
  * Start listening for connect requests. Generates one CONNECT_REQUEST
  * event for each inbound connect request.
  *
  * @cm_id:   the cm_id to put into the LISTEN state; must be IDLE.
  * @backlog: passed to alloc_work_entries(), iw_create_listen() and the
  *           provider's create_listen_ep().
  *
  * Returns 0 on success, the alloc_work_entries() or iw_create_listen()
  * error, or -EINVAL when the cm_id is not IDLE or the port list returns
  * a non-positive count.
  */
 int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
 {
 	struct iwcm_id_private *cm_id_priv;
 	unsigned long flags;
 	int ret, refcnt;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
 	ret = alloc_work_entries(cm_id_priv, backlog);
 	if (ret)
 		return ret;
 
 	/* From here on ret == 0 unless one of the branches below sets it. */
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_IDLE:
 		cm_id_priv->state = IW_CM_STATE_LISTEN;
 		/* The listener setup calls below are made without the lock. */
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 		if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
 			refcnt =
 			  add_port_to_listenlist(cm_id->local_addr.sin_port);
 
 			/*
 			 * NOTE(review): if iw_create_listen() fails here the
 			 * port stays on the listen list; nothing in this
 			 * function undoes add_port_to_listenlist().
 			 */
 			if (refcnt == 1) {
 				ret = iw_create_listen(cm_id, backlog);
 			} else if (refcnt <= 0) {
 				ret = -EINVAL;
 			} else {
 				/* if refcnt > 1, a socket listener created
 				 * already. And we need not create socket
 				 * listener on other rdma devices/listen cm_id's
 				 * due to TOE. That is when a socket listener is
 				 * created with INADDR_ANY all registered TOE
 				 * devices will get a call to start
 				 * hardware listeners.
 				 */
 			}
 		} else {
 			ret = iw_create_listen(cm_id, backlog);
 		}
 		/*
 		 * NOTE(review): the rollback to IDLE below is written before
 		 * the lock is retaken.
 		 */
 		if (!ret)
 			cm_id->device->iwcm->create_listen_ep(cm_id, backlog);
 		else
 			cm_id_priv->state = IW_CM_STATE_IDLE;
 
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	default:
 		/* Listening is only allowed from the IDLE state. */
 		ret = -EINVAL;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	return ret;
 }
 EXPORT_SYMBOL(iw_cm_listen);
 
 /*
  * CM_ID <-- IDLE
  *
  * Reject an inbound connection request; no events are generated.
  *
  * The cm_id must be in CONN_RECV, otherwise -EINVAL is returned and the
  * provider is not called. On the valid path the state becomes IDLE and
  * the provider's reject() result is returned. CONNECT_WAIT is set for
  * the duration of the call and waiters are woken on every exit path.
  */
 int iw_cm_reject(struct iw_cm_id *cm_id,
 		 const void *private_data,
 		 u8 private_data_len)
 {
 	struct iwcm_id_private *priv =
 	    container_of(cm_id, struct iwcm_id_private, id);
 	unsigned long flags;
 	int ret = -EINVAL;
 
 	set_bit(IWCM_F_CONNECT_WAIT, &priv->flags);
 
 	spin_lock_irqsave(&priv->lock, flags);
 	if (priv->state == IW_CM_STATE_CONN_RECV) {
 		priv->state = IW_CM_STATE_IDLE;
 		spin_unlock_irqrestore(&priv->lock, flags);
 
 		/* The provider is called without the lock held. */
 		ret = cm_id->device->iwcm->reject(cm_id, private_data,
 						  private_data_len);
 	} else {
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
 	clear_bit(IWCM_F_CONNECT_WAIT, &priv->flags);
 	wake_up_all(&priv->connect_wait);
 
 	return ret;
 }
 EXPORT_SYMBOL(iw_cm_reject);
 
 /*
  * CM_ID <-- ESTABLISHED
  *
  * Accepts an inbound connection request and generates an ESTABLISHED
  * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
  * until the ESTABLISHED event is received from the provider.
  *
  * @cm_id:    cm_id in the CONN_RECV state.
  * @iw_param: connection parameters; iw_param->qpn selects the QP.
  *
  * Returns 0 on success, -EINVAL if the state is wrong or the QP cannot
  * be found, or the provider's accept() error. On success CONNECT_WAIT
  * stays set; it is cleared in cm_conn_est_handler().
  */
 int iw_cm_accept(struct iw_cm_id *cm_id,
 		 struct iw_cm_conn_param *iw_param)
 {
 	struct iwcm_id_private *cm_id_priv;
 	struct ib_qp *qp;
 	unsigned long flags;
 	int ret;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 		return -EINVAL;
 	}
 	/* Get the ib_qp given the QPN */
 	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
 	if (!qp) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 		return -EINVAL;
 	}
 	/* The cm_id holds a QP reference for as long as cm_id_priv->qp is set. */
 	cm_id->device->iwcm->add_ref(qp);
 	cm_id_priv->qp = qp;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
 	if (ret) {
 		/* An error on accept precludes provider events */
 		/*
 		 * NOTE(review): the state check and the write to IDLE happen
 		 * before the lock is taken.
 		 */
 		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		/* Undo the QP reference taken above, unless already dropped. */
 		if (cm_id_priv->qp) {
 			cm_id->device->iwcm->rem_ref(qp);
 			cm_id_priv->qp = NULL;
 		}
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 	}
 
 	return ret;
 }
 EXPORT_SYMBOL(iw_cm_accept);
 
 /*
  * Active Side: CM_ID <-- CONN_SENT
  *
  * If successful, results in the generation of a CONNECT_REPLY
  * event. iw_cm_disconnect and iw_cm_destroy will block until the
  * CONNECT_REPLY event is received from the provider.
  *
  * @cm_id:    cm_id in the IDLE state.
  * @iw_param: connection parameters; iw_param->qpn selects the QP.
  *
  * Returns 0 on success, the alloc_work_entries() error, -EINVAL if the
  * state is wrong or the QP cannot be found, or the provider's connect()
  * error. On success CONNECT_WAIT stays set; it is cleared in
  * cm_conn_rep_handler().
  */
 int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
 {
 	struct iwcm_id_private *cm_id_priv;
 	int ret;
 	unsigned long flags;
 	struct ib_qp *qp;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
 	/* Reserve work entries for the events this connection can raise. */
 	ret = alloc_work_entries(cm_id_priv, 4);
 	if (ret)
 		return ret;
 
 	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 
 	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 		return -EINVAL;
 	}
 
 	/* Get the ib_qp given the QPN */
 	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
 	if (!qp) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 		return -EINVAL;
 	}
 	/* The cm_id holds a QP reference for as long as cm_id_priv->qp is set. */
 	cm_id->device->iwcm->add_ref(qp);
 	cm_id_priv->qp = qp;
 	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	ret = cm_id->device->iwcm->connect(cm_id, iw_param);
 	if (ret) {
 		/* Provider refused: drop the QP reference and go back to IDLE. */
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		if (cm_id_priv->qp) {
 			cm_id->device->iwcm->rem_ref(qp);
 			cm_id_priv->qp = NULL;
 		}
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		/*
 		 * NOTE(review): the state check and the write to IDLE happen
 		 * after the lock has been released.
 		 */
 		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 		wake_up_all(&cm_id_priv->connect_wait);
 	}
 
 	return ret;
 }
 EXPORT_SYMBOL(iw_cm_connect);
 
 /*
  * Passive Side: new CM_ID <-- CONN_RECV
  *
  * Handles an inbound connect request. The function creates a new
  * iw_cm_id to represent the new connection and inherits the client
  * callback function and other attributes from the listening parent.
  *
  * The work item contains a pointer to the listen_cm_id and the event. The
  * listen_cm_id contains the client cm_handler, context and
  * device. These are copied when the device is cloned. The event
  * contains the new four tuple.
  *
  * An error on the child should not affect the parent, so this
  * function does not return a value.
  *
  * The event's private_data buffer is freed here on every exit path when
  * private_data_len is non-zero.
  */
 static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
 				struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
 	struct iw_cm_id *cm_id;
 	struct iwcm_id_private *cm_id_priv;
 	int ret;
 
 	/*
 	 * The provider should never generate a connection request
 	 * event with a bad status.
 	 */
 	BUG_ON(iw_event->status);
 
 	/* The child reuses the listener's device, handler and context. */
 	cm_id = iw_create_cm_id(listen_id_priv->id.device,
 				iw_event->so,
 				listen_id_priv->id.cm_handler,
 				listen_id_priv->id.context);
 	/* If the cm_id could not be created, ignore the request */
 	if (IS_ERR(cm_id))
 		goto out;
 
 	cm_id->provider_data = iw_event->provider_data;
 	cm_id->local_addr = iw_event->local_addr;
 	cm_id->remote_addr = iw_event->remote_addr;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 	cm_id_priv->state = IW_CM_STATE_CONN_RECV;
 
 	/*
 	 * We could be destroying the listening id. If so, ignore this
 	 * upcall.
 	 */
 	spin_lock_irqsave(&listen_id_priv->lock, flags);
 	if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
 		spin_unlock_irqrestore(&listen_id_priv->lock, flags);
 		iw_cm_reject(cm_id, NULL, 0);
 		iw_destroy_cm_id(cm_id);
 		goto out;
 	}
 	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
 
 	/* Reserve work entries for events on the new connection. */
 	ret = alloc_work_entries(cm_id_priv, 3);
 	if (ret) {
 		iw_cm_reject(cm_id, NULL, 0);
 		iw_destroy_cm_id(cm_id);
 		goto out;
 	}
 
 	/* Call the client CM handler */
 	ret = cm_id->cm_handler(cm_id, iw_event);
 	if (ret) {
 		/*
 		 * The client refused the request: reject it and destroy the
 		 * child from this callback context. The cm_id is freed here
 		 * only if no references remain after destroy_cm_id().
 		 */
 		iw_cm_reject(cm_id, NULL, 0);
 		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
 		destroy_cm_id(cm_id);
 		if (atomic_read(&cm_id_priv->refcount)==0)
 			free_cm_id(cm_id_priv);
 	}
 
 out:
 	if (iw_event->private_data_len)
 		kfree(iw_event->private_data);
 }
 
 /*
  * Passive Side: CM_ID <-- ESTABLISHED
  *
  * The provider generated an ESTABLISHED event which means that
  * the MPA negotiation has completed successfully and we are now in MPA
  * FPDU mode.
  *
  * This event can only be received in the CONN_RECV state. If the
  * remote peer closed, the ESTABLISHED event would be received followed
  * by the CLOSE event. If the app closes, it will block until we wake
  * it up after processing this event.
  *
  * Returns the value returned by the client's cm_handler.
  */
 static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
 			       struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 
 	/*
 	 * We clear the CONNECT_WAIT bit here to allow the callback
 	 * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
 	 * from a callback handler is not allowed.
 	 */
 	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
 	cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	/* The client handler runs without the lock held. */
 	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
 	/* Waiters on connect_wait are only woken after the handler returns. */
 	wake_up_all(&cm_id_priv->connect_wait);
 
 	return ret;
 }
 
 /*
  * Active Side: CM_ID <-- ESTABLISHED
  *
  * The app has called connect and is waiting for the established event to
  * post its requests to the server. This event will wake up anyone
  * blocked in iw_cm_disconnect or iw_destroy_id.
  *
  * Returns the value returned by the client's cm_handler. The event's
  * private_data buffer is freed here when private_data_len is non-zero.
  */
 static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
 			       struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	/*
 	 * Clear the connect wait bit so a callback function calling
 	 * iw_cm_disconnect will not wait and deadlock this thread
 	 */
 	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
 	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
 	if (iw_event->status == 0) {
 		/* Connected: record the addresses reported by the provider. */
 		cm_id_priv->id.local_addr = iw_event->local_addr;
 		cm_id_priv->id.remote_addr = iw_event->remote_addr;
 		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
 	} else {
 		/* REJECTED or RESET */
 		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
 		cm_id_priv->qp = NULL;
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	/* The client handler runs without the lock held. */
 	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
 
 	if (iw_event->private_data_len)
 		kfree(iw_event->private_data);
 
 	/* Wake up waiters on connect complete */
 	wake_up_all(&cm_id_priv->connect_wait);
 
 	return ret;
 }
 
 /*
  * CM_ID <-- CLOSING
  *
  * A disconnect event only has an effect on an ESTABLISHED cm_id, which
  * moves to CLOSING; any other state is left as it is. The event itself
  * is not inspected and the client handler is not called.
  */
 static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
 				  struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_ESTABLISHED:
 		cm_id_priv->state = IW_CM_STATE_CLOSING;
 		break;
 	default:
 		/* Nothing to do in any other state. */
 		break;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 }
 
 /*
  * CM_ID <-- IDLE
  *
  * If in the ESTABLISHED or CLOSING states, the QP will have been
  * moved by the provider to the ERR state. Disassociate the CM_ID from
  * the QP,  move to IDLE, and remove the 'connected' reference.
  *
  * If in some other state, the cm_id was destroyed asynchronously.
  * This is the last reference that will result in waking up
  * the app thread blocked in iw_destroy_cm_id.
  *
  * Returns the client cm_handler's return value when the handler is
  * called (ESTABLISHED/CLOSING), otherwise 0.
  */
 static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
 				  struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
 	int ret = 0;
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 
 	/* Drop the QP reference held by the cm_id, whatever the state. */
 	if (cm_id_priv->qp) {
 		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
 		cm_id_priv->qp = NULL;
 	}
 	switch (cm_id_priv->state) {
 	case IW_CM_STATE_ESTABLISHED:
 	case IW_CM_STATE_CLOSING:
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 		/* The client handler runs without the lock held. */
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 		break;
 	case IW_CM_STATE_DESTROYING:
 		/* Destroy already in progress: the client is not notified. */
 		break;
 	default:
 		BUG();
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	return ret;
 }
 
 /*
  * Dispatch one queued event to its per-type handler.
  *
  * Returns the handler's result for CONNECT_REPLY, ESTABLISHED and CLOSE
  * events, and 0 for CONNECT_REQUEST and DISCONNECT, whose handlers
  * return nothing. An unknown event type is a BUG().
  */
 static int process_event(struct iwcm_id_private *cm_id_priv,
 			 struct iw_cm_event *iw_event)
 {
 	switch (iw_event->event) {
 	case IW_CM_EVENT_CONNECT_REQUEST:
 		cm_conn_req_handler(cm_id_priv, iw_event);
 		return 0;
 	case IW_CM_EVENT_CONNECT_REPLY:
 		return cm_conn_rep_handler(cm_id_priv, iw_event);
 	case IW_CM_EVENT_ESTABLISHED:
 		return cm_conn_est_handler(cm_id_priv, iw_event);
 	case IW_CM_EVENT_DISCONNECT:
 		cm_disconnect_handler(cm_id_priv, iw_event);
 		return 0;
 	case IW_CM_EVENT_CLOSE:
 		return cm_close_handler(cm_id_priv, iw_event);
 	default:
 		BUG();
 	}
 
 	/* Only reached if BUG() returns. */
 	return 0;
 }
 
 /*
  * Process events on the work_list for the cm_id. If the callback
  * function requests that the cm_id be deleted, a flag is set in the
  * cm_id flags to indicate that when the last reference is
  * removed, the cm_id is to be destroyed. This is necessary to
  * distinguish between an object that will be destroyed by the app
  * thread asleep on the destroy_comp list vs. an object destroyed
  * here synchronously when the last reference is removed.
  */
 static void cm_work_handler(struct work_struct *_work)
 {
 	struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
 	struct iw_cm_event levent;
 	struct iwcm_id_private *cm_id_priv = work->cm_id;
 	unsigned long flags;
 	int empty;
 	int ret = 0;
 	int destroy_id;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	empty = list_empty(&cm_id_priv->work_list);
 	while (!empty) {
 		work = list_entry(cm_id_priv->work_list.next,
 				  struct iwcm_work, list);
 		list_del_init(&work->list);
 		/* Note now whether this was the last queued entry. */
 		empty = list_empty(&cm_id_priv->work_list);
 		/*
 		 * Copy the event so the work entry can be released before
 		 * the event is processed without the lock.
 		 */
 		levent = work->event;
 		put_work(work);
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 		ret = process_event(cm_id_priv, &levent);
 		if (ret) {
 			/* Non-zero from the handler requests destruction. */
 			set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
 			destroy_cm_id(&cm_id_priv->id);
 		}
 		BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
 		/*
 		 * Read the flag before dropping this event's reference; the
 		 * cm_id is not touched afterwards unless it is freed here.
 		 */
 		destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
 		if (iwcm_deref_id(cm_id_priv)) {
 			/*
 			 * iwcm_deref_id() returned non-zero (presumably the
 			 * last reference went away): free the cm_id here
 			 * only for a callback-requested destroy.
 			 */
 			if (destroy_id) {
 				BUG_ON(!list_empty(&cm_id_priv->work_list));
 				free_cm_id(cm_id_priv);
 			}
 			return;
 		}
 		if (empty)
 			return;
 		spin_lock_irqsave(&cm_id_priv->lock, flags);
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 }
 
 /*
  * This function is called on interrupt context. Schedule events on
  * the iwcm_wq thread to allow callback functions to downcall into
  * the CM and/or block.  Events are queued to a per-CM_ID
  * work_list. If this is the first event on the work_list, the work
  * element is also queued on the iwcm_wq thread.
  *
  * Each event holds a reference on the cm_id. Until the last posted
  * event has been delivered and processed, the cm_id cannot be
  * deleted.
  *
  * Returns:
  * 	      0	- the event was handled.
  *	-ENOMEM	- the event was not handled due to lack of resources.
  *	A non-zero result from copy_private_data() is returned unchanged.
  */
 static int cm_event_handler(struct iw_cm_id *cm_id,
 			     struct iw_cm_event *iw_event)
 {
 	struct iwcm_work *work;
 	struct iwcm_id_private *cm_id_priv;
 	unsigned long flags;
 	int ret = 0;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	/* Work entries are taken from the cm_id via get_work(). */
 	work = get_work(cm_id_priv);
 	if (!work) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	INIT_WORK(&work->work, cm_work_handler);
 	work->cm_id = cm_id_priv;
 	work->event = *iw_event;
 
 	/*
 	 * Only CONNECT_REQUEST and CONNECT_REPLY events get their private
 	 * data copied via copy_private_data(); the matching handlers
 	 * kfree() it after the client callback.
 	 */
 	if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
 	     work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
 	    work->event.private_data_len) {
 		ret = copy_private_data(&work->event);
 		if (ret) {
 			put_work(work);
 			goto out;
 		}
 	}
 
 	/* The queued event holds a reference until cm_work_handler drops it. */
 	atomic_inc(&cm_id_priv->refcount);
 	if (list_empty(&cm_id_priv->work_list)) {
 		/* First pending event: also schedule the work item. */
 		list_add_tail(&work->list, &cm_id_priv->work_list);
 		queue_work(iwcm_wq, &work->work);
 	} else
 		list_add_tail(&work->list, &cm_id_priv->work_list);
 out:
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 
 /*
  * Fill in the QP attributes used for the INIT/RTR transitions.
  *
  * In the IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states the mask is
  * set to IB_QP_STATE | IB_QP_ACCESS_FLAGS, remote read and write access
  * is granted, and 0 is returned. In any other state nothing is written
  * and -EINVAL is returned.
  */
 static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
 				  struct ib_qp_attr *qp_attr,
 				  int *qp_attr_mask)
 {
 	unsigned long flags;
 	int ret = -EINVAL;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id_priv->state == IW_CM_STATE_IDLE ||
 	    cm_id_priv->state == IW_CM_STATE_CONN_SENT ||
 	    cm_id_priv->state == IW_CM_STATE_CONN_RECV ||
 	    cm_id_priv->state == IW_CM_STATE_ESTABLISHED) {
 		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
 		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
 					   IB_ACCESS_REMOTE_READ;
 		ret = 0;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	return ret;
 }
 
 /*
  * Fill in the QP attributes used for the RTS transition.
  *
  * In the IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states the mask is
  * cleared to 0 and 0 is returned; qp_attr is not modified. In any other
  * state nothing is written and -EINVAL is returned.
  */
 static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
 				  struct ib_qp_attr *qp_attr,
 				  int *qp_attr_mask)
 {
 	unsigned long flags;
 	int ret = -EINVAL;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id_priv->state == IW_CM_STATE_IDLE ||
 	    cm_id_priv->state == IW_CM_STATE_CONN_SENT ||
 	    cm_id_priv->state == IW_CM_STATE_CONN_RECV ||
 	    cm_id_priv->state == IW_CM_STATE_ESTABLISHED) {
 		*qp_attr_mask = 0;
 		ret = 0;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	return ret;
 }
 
 /*
  * Initialise QP attributes for the transition named by
  * qp_attr->qp_state: INIT and RTR use iwcm_init_qp_init_attr(), RTS
  * uses iwcm_init_qp_rts_attr(). Any other target state is rejected
  * with -EINVAL. Returns the selected helper's result otherwise.
  */
 int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
 		       struct ib_qp_attr *qp_attr,
 		       int *qp_attr_mask)
 {
 	struct iwcm_id_private *priv =
 	    container_of(cm_id, struct iwcm_id_private, id);
 
 	switch (qp_attr->qp_state) {
 	case IB_QPS_INIT:
 	case IB_QPS_RTR:
 		return iwcm_init_qp_init_attr(priv, qp_attr, qp_attr_mask);
 	case IB_QPS_RTS:
 		return iwcm_init_qp_rts_attr(priv, qp_attr, qp_attr_mask);
 	default:
 		return -EINVAL;
 	}
 }
 EXPORT_SYMBOL(iw_cm_init_qp_attr);
 
 /*
  * Module init: create the single-threaded "iw_cm_wq" workqueue.
  * Returns 0 on success or -ENOMEM if the workqueue cannot be created.
  */
 static int __init iw_cm_init(void)
 {
 	iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
 
 	return (iwcm_wq != NULL) ? 0 : -ENOMEM;
 }
 
 /* Module exit: destroy the workqueue created by iw_cm_init(). */
 static void __exit iw_cm_cleanup(void)
 {
 	destroy_workqueue(iwcm_wq);
 }
 
 module_init(iw_cm_init);
 module_exit(iw_cm_cleanup);
Index: head/sys/ofed/drivers/infiniband/core/umem.c
===================================================================
--- head/sys/ofed/drivers/infiniband/core/umem.c	(revision 320071)
+++ head/sys/ofed/drivers/infiniband/core/umem.c	(revision 320072)
@@ -1,445 +1,446 @@
 /*
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Cisco Systems.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #define	LINUXKPI_PARAM_PREFIX ibcore_
 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 #include <linux/sched.h>
 #include <linux/dma-attrs.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/wait.h>
 #include <sys/priv.h>
 #include <sys/resourcevar.h>
 #include <sys/vmmeter.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 #include "uverbs.h"
 
 #define IB_UMEM_MAX_PAGE_CHUNK		(PAGE_SIZE / sizeof (struct page *))
 
 static int allow_weak_ordering;
 module_param_named(weak_ordering, allow_weak_ordering, int, 0444);
 MODULE_PARM_DESC(weak_ordering,  "Allow weak ordering for data registered memory");
 
 static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem,
 				       struct ib_umem *umem, unsigned long addr,
 				       int dmasync, int invalidation_supported)
 {
 	int ret;
 	const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
 	struct invalidation_ctx *invalidation_ctx = NULL;
 
 	umem->ib_peer_mem = ib_peer_mem;
 	if (invalidation_supported) {
 		invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL);
 		if (!invalidation_ctx) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		umem->invalidation_ctx = invalidation_ctx;
 		invalidation_ctx->umem = umem;
 		mutex_lock(&ib_peer_mem->lock);
 		invalidation_ctx->context_ticket =
 				ib_peer_insert_context(ib_peer_mem, invalidation_ctx);
 		/* unlock before calling get pages to prevent a dead-lock from the callback */
 		mutex_unlock(&ib_peer_mem->lock);
 	}
 
 	ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1,
 				&umem->sg_head, 
 				umem->peer_mem_client_context,
 				invalidation_ctx ?
 				(void *)invalidation_ctx->context_ticket : NULL);
 
 	if (invalidation_ctx) {
 		/* taking the lock back, checking that wasn't invalidated at that time */
 		mutex_lock(&ib_peer_mem->lock);
 		if (invalidation_ctx->peer_invalidated) {
 			printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n");
 			ret = -EINVAL;
 		}
 	}
 
 	if (ret)
 		goto out;
 
 	umem->page_size = peer_mem->get_page_size
 					(umem->peer_mem_client_context);
 	if (umem->page_size <= 0)
 		goto put_pages;
 
 	umem->offset = addr & ((unsigned long)umem->page_size - 1);
 	ret = peer_mem->dma_map(&umem->sg_head,
 					umem->peer_mem_client_context,
 					umem->context->device->dma_device,
 					dmasync,
 					&umem->nmap);
 	if (ret)
 		goto put_pages;
 
 	ib_peer_mem->stats.num_reg_pages +=
 			umem->nmap * (umem->page_size >> PAGE_SHIFT);
 	ib_peer_mem->stats.num_alloc_mrs += 1;
 	return umem;
 
 put_pages:
 
 	peer_mem->put_pages(umem->peer_mem_client_context,
 					&umem->sg_head);
 out:
 	if (invalidation_ctx) {
 		ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
 		mutex_unlock(&umem->ib_peer_mem->lock);
 		kfree(invalidation_ctx);
 	}
 
 	ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
 				umem->peer_mem_srcu_key);
 	kfree(umem);
 	return ERR_PTR(ret);
 }
 
 /*
  * Release a umem that was set up by peer_umem_get(): detach and possibly
  * free its invalidation context, unmap and put the peer's pages, update
  * the peer statistics, drop the peer client reference and free the umem.
  */
 static void peer_umem_release(struct ib_umem *umem)
 {
 	struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem;
 	const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
 	struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx;
 
 	if (invalidation_ctx) {
 
 		int peer_callback;
 		int inflight_invalidation;
 		/* If we are not under a peer callback we must take the lock before
 		  * removing the core ticket from the tree and releasing its umem.
 		  * That lets any in-flight callbacks finish safely.
 		  * If we are under a peer callback, or in the reg_mr error flow where
 		  * the context was never activated, the lock is already held.
 		*/
 		if (invalidation_ctx->func && !invalidation_ctx->peer_callback)
 			mutex_lock(&ib_peer_mem->lock);
 		ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
 		/* Check the in-flight flag only after taking the lock and removing the
 		  * ticket from the tree. From here on use local copies of peer_callback
 		  * and inflight_invalidation: once complete() has run, invalidation_ctx
 		  * must not be accessed because the callback may free it.
 		*/
 		peer_callback = invalidation_ctx->peer_callback;
 		inflight_invalidation = invalidation_ctx->inflight_invalidation;
 		if (inflight_invalidation)
 			complete(&invalidation_ctx->comp);
 		/* On peer callback lock is handled externally */
 		if (!peer_callback)
 			/* unlocking before put_pages */
 			mutex_unlock(&ib_peer_mem->lock);
 		/* when under a callback, or with one pending, the callback frees the invalidation context */
 		if (!peer_callback && !inflight_invalidation)
 			kfree(invalidation_ctx);
 	}
 
 	peer_mem->dma_unmap(&umem->sg_head,
 					umem->peer_mem_client_context,
 					umem->context->device->dma_device);
 	peer_mem->put_pages(&umem->sg_head,
 					  umem->peer_mem_client_context);
 
 	/* Release-side counterparts of the counters bumped in peer_umem_get(). */
 	ib_peer_mem->stats.num_dereg_pages +=
 			umem->nmap * (umem->page_size >> PAGE_SHIFT);
 	ib_peer_mem->stats.num_dealloc_mrs += 1;
 	ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
 				umem->peer_mem_srcu_key);
 	kfree(umem);
 
 	return;
 
 }
 
 /*
  * Undo the DMA mapping of a umem and free its scatter table.
  *
  * When the umem is writable and @dirty is non-zero, every page in the
  * table is marked dirty first. vm_page_dirty() is called with the
  * page's VM object write-locked; the lock is kept across consecutive
  * pages of the same object and released after the table is freed.
  */
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
 	vm_object_t locked_obj = NULL;
 	struct scatterlist *sg;
 	struct page *page;
 	int idx;
 
 	if (umem->nmap > 0)
 		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->nmap,
 		    DMA_BIDIRECTIONAL);
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->npages, idx) {
 		page = sg_page(sg);
 		if (umem->writable && dirty) {
 			/* Switch the held lock when the owning object changes. */
 			if (locked_obj != page->object) {
 				if (locked_obj)
 					VM_OBJECT_WUNLOCK(locked_obj);
 				locked_obj = page->object;
 				VM_OBJECT_WLOCK(locked_obj);
 			}
 			vm_page_dirty(page);
 		}
 	}
 
 	sg_free_table(&umem->sg_head);
 	if (locked_obj)
 		VM_OBJECT_WUNLOCK(locked_obj);
 }
 
 /*
  * Arm the invalidation callback of a peer-memory umem.
  *
  * Stores @func and @cookie in the umem's invalidation context, then
  * releases ib_peer_mem->lock. The umem must have an invalidation
  * context; umem->invalidation_ctx is dereferenced unconditionally.
  */
 void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
 					       umem_invalidate_func_t func,
 					       void *cookie)
 {
 	struct invalidation_ctx *ctx = umem->invalidation_ctx;
 
 	ctx->cookie = cookie;
 	ctx->func = func;
 
 	/* once the lock is dropped, pending invalidations may be delivered */
 	mutex_unlock(&umem->ib_peer_mem->lock);
 }
 EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
 /**
  * ib_umem_get_ex - Pin and DMA map userspace memory.
  * @context: userspace context to pin memory for
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
  * @dmasync: flush in-flight DMA when the memory region is written
  * @invalidation_supported: non-zero to look for a peer memory client and
  *	request invalidation support; passed through to peer_umem_get()
  *
  * Returns the new umem, or an ERR_PTR() on failure.
  */
 struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr,
 			    size_t size, int access, int dmasync,
 			    int invalidation_supported)
 {
 
 	struct ib_umem *umem;
 	struct proc *proc;
 	pmap_t pmap;
 	vm_offset_t end, last, start;
 	vm_size_t npages;
 	int error;
 	int ret;
 	int ents;
 	int i;
 	DEFINE_DMA_ATTRS(attrs);
 	struct scatterlist *sg, *sg_list_start;
 	int need_release = 0;
 
 	error = priv_check(curthread, PRIV_VM_MLOCK);
 	if (error)
 		return ERR_PTR(-error);
 
 	last = addr + size;
 	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
 	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
 	/* Reject ranges whose end wraps around the address space. */
 	if (last < addr || end < addr)
 		return ERR_PTR(-EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return ERR_PTR(-ENOMEM);
 	umem = kzalloc(sizeof *umem, GFP_KERNEL);
 	if (!umem)
 		return ERR_PTR(-ENOMEM);
 	/*
 	 * We ask for writable memory if any access flags other than
 	 * "remote read" are set.  "Local write" and "remote write"
 	 * obviously require write access.  "Remote atomic" can do
 	 * things like fetch and add, which will modify memory, and
 	 * "MW bind" can change permissions by binding a window.
 	 *
 	 * This must be decided before vm_map_wire() below, which reads
 	 * umem->writable to choose VM_MAP_WIRE_WRITE. Setting it later
 	 * left the field at its kzalloc() value of 0, so write wiring
 	 * was never requested.
 	 */
 	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
 
 	/* Enforce the per-process RLIMIT_MEMLOCK and the global wired limit. */
 	proc = curthread->td_proc;
 	PROC_LOCK(proc);
 	if (ptoa(npages +
 	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
 	    lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 		PROC_UNLOCK(proc);
 		kfree(umem);
 		return ERR_PTR(-ENOMEM);
 	}
 	PROC_UNLOCK(proc);
 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired) {
 		kfree(umem);
 		return ERR_PTR(-EAGAIN);
 	}
 	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
 	    (umem->writable ? VM_MAP_WIRE_WRITE : 0));
 	if (error != KERN_SUCCESS) {
 		kfree(umem);
 		return ERR_PTR(-ENOMEM);
 	}
 	/*
 	 * NOTE(review): none of the failure paths below call
 	 * vm_map_unwire(), so the range stays wired when this function
 	 * fails after this point.
 	 */
 
 	umem->context   = context;
 	umem->length    = size;
 	umem->offset    = addr & ~PAGE_MASK;
 	umem->page_size = PAGE_SIZE;
 	umem->start	= addr;
 
 	if (invalidation_supported || context->peer_mem_private_data) {
 
 		struct ib_peer_memory_client *peer_mem_client;
 
 		peer_mem_client =  ib_get_peer_client(context, addr, size,
 			&umem->peer_mem_client_context,
 				&umem->peer_mem_srcu_key);
 		/* A peer client owns this range: it completes (or frees) the umem. */
 		if (peer_mem_client)
 			return peer_umem_get(peer_mem_client, umem, addr,
 				dmasync, invalidation_supported);
 	}
 
 	umem->hugetlb = 0;
 
 	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
 
 	if (npages == 0) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
 	if (ret)
 		goto out;
 
 	/* From here on the error path must also free the scatter table. */
 	need_release = 1;
 	sg_list_start = umem->sg_head.sgl;
 
 	/* Fill the table in chunks, one page-sized entry per wired page. */
 	while (npages) {
 
 		ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
 		umem->npages += ents;
 
 		for_each_sg(sg_list_start, sg, ents, i) {
 			vm_paddr_t pa;
 
 			pa = pmap_extract(pmap, start);
 			if (pa == 0) {
 				ret = -ENOMEM;
 				goto out;
 			}
 			sg_set_page(sg, PHYS_TO_VM_PAGE(pa),
 			    PAGE_SIZE, 0);
 			npages--;
 			start += PAGE_SIZE;
 		}
 
 		/* preparing for next loop */
 		sg_list_start = sg;
 	}
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 					umem->sg_head.sgl,
 					umem->npages,
 					DMA_BIDIRECTIONAL,
 					&attrs);
 	/* Every page must have been mapped. */
 	if (umem->nmap != umem->npages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 out:
 	if (ret < 0) {
 		if (need_release)
 			__ib_umem_release(context->device, umem, 0);
 		kfree(umem);
 	}
 
 	return ret < 0 ? ERR_PTR(ret) : umem;
 }
 EXPORT_SYMBOL(ib_umem_get_ex);
 
 /*
  * Pin and DMA map userspace memory without peer invalidation support:
  * calls ib_umem_get_ex() with invalidation_supported set to 0 and
  * returns its result unchanged.
  */
 struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 			    size_t size, int access, int dmasync)
 {
 	const int invalidation_supported = 0;
 
 	return ib_umem_get_ex(context, addr, size, access, dmasync,
 	    invalidation_supported);
 }
 EXPORT_SYMBOL(ib_umem_get);
 
 /**
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
  *
  * Peer-memory umems are handed to peer_umem_release(). Otherwise the
  * DMA mapping and scatter table are released, the user range is
  * unwired unless the context is closing, and @umem is freed.
  */
 void ib_umem_release(struct ib_umem *umem)
 {
 
 	vm_offset_t addr, end, last, start;
 	vm_size_t size;
 	int error;
 
 	if (umem->ib_peer_mem) {
 		peer_umem_release(umem);
 		return;
 	}
 
 	__ib_umem_release(umem->context->device, umem, 1);
 
 	/* When the context is closing, the range is not unwired here. */
 	if (umem->context->closing) {
 		kfree(umem);
 		return;
 	}
 
 	error = priv_check(curthread, PRIV_VM_MUNLOCK);
 
 	if (error) {
 		/*
 		 * The range cannot be unwired without the privilege, but
 		 * the umem itself must still be freed; returning without
 		 * kfree() leaked it.
 		 */
 		kfree(umem);
 		return;
 	}
 
 	/* Recompute the page-aligned range the same way ib_umem_get_ex() does. */
 	addr = umem->start;
 	size = umem->length;
 	last = addr + size;
 	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
 	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
 	vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 	kfree(umem);
 
 }
 EXPORT_SYMBOL(ib_umem_release);
 
 int ib_umem_page_count(struct ib_umem *umem)
 {
 	int shift;
 	int i;
 	int n;
 	struct scatterlist *sg;
 
 	shift = ilog2(umem->page_size);
 
 	n = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
 		n += sg_dma_len(sg) >> shift;
 
 	return n;
 }
 EXPORT_SYMBOL(ib_umem_page_count);
Index: head/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
===================================================================
--- head/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h	(revision 320071)
+++ head/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h	(revision 320072)
@@ -1,596 +1,597 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 
 #ifndef MTHCA_DEV_H
 #define MTHCA_DEV_H
 
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/timer.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/semaphore.h>
+#include <linux/wait.h>
 
 #include "mthca_provider.h"
 #include "mthca_doorbell.h"
 
 /*
  * Driver identification strings.  PFX is DRV_NAME followed by ": ".
  */
 #define DRV_NAME	"ib_mthca"
 #define PFX		DRV_NAME ": "
 #define DRV_VERSION	"1.0-ofed1.5.2"
 #define DRV_RELDATE	"August 4, 2010"
 
 /*
  * Single-bit flag values (bit 0 is not assigned here).
  * mthca_is_memfree() tests MTHCA_FLAG_MEMFREE against
  * mthca_dev.mthca_flags; the other bits are presumably kept in the same
  * field -- TODO confirm against their users.
  */
 enum {
 	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
 	MTHCA_FLAG_SRQ        = 1 << 2,
 	MTHCA_FLAG_MSI_X      = 1 << 3,
 	MTHCA_FLAG_NO_LAM     = 1 << 4,
 	MTHCA_FLAG_FMR        = 1 << 5,
 	MTHCA_FLAG_MEMFREE    = 1 << 6,
 	MTHCA_FLAG_PCIE       = 1 << 7,
 	MTHCA_FLAG_SINAI_OPT  = 1 << 8
 };
 
 /*
  * Dimension of the per-port arrays in struct mthca_dev
  * (send_agent, sm_ah and rate).
  */
 enum {
 	MTHCA_MAX_PORTS = 2
 };
 
 /* Size, in chars, of the board_id buffer in struct mthca_dev. */
 enum {
 	MTHCA_BOARD_ID_LEN = 64
 };
 
 /*
  * Context and table-entry size constants (presumably in bytes -- TODO
  * confirm).  MTHCA_QP_PER_MGM is derived from MTHCA_MGM_ENTRY_SIZE:
  * 4 * (0x100 / 16 - 2) == 56.
  */
 enum {
 	MTHCA_EQ_CONTEXT_SIZE =  0x40,
 	MTHCA_CQ_CONTEXT_SIZE =  0x40,
 	MTHCA_QP_CONTEXT_SIZE = 0x200,
 	MTHCA_RDB_ENTRY_SIZE  =  0x20,
 	MTHCA_AV_SIZE         =  0x20,
 	MTHCA_MGM_ENTRY_SIZE  = 0x100,
 
 	/* Arbel FW gives us these, but we need them for Tavor */
 	MTHCA_MPT_ENTRY_SIZE  =  0x40,
 	MTHCA_MTT_SEG_SIZE    =  0x40,
 
 	MTHCA_QP_PER_MGM      = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
 };
 
 /*
  * Indices into mthca_eq_table.eq[].  MTHCA_NUM_EQ is last, so it equals
  * the number of preceding entries (3) and is used as the array size.
  */
 enum {
 	MTHCA_EQ_CMD,
 	MTHCA_EQ_ASYNC,
 	MTHCA_EQ_COMP,
 	MTHCA_NUM_EQ
 };
 
 /*
  * Numeric opcode values.
  * NOTE(review): presumably the hardware work-request opcodes -- confirm
  * against the QP/CQ code that uses them.
  */
 enum {
 	MTHCA_OPCODE_NOP            = 0x00,
 	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
 	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
 	MTHCA_OPCODE_SEND           = 0x0a,
 	MTHCA_OPCODE_SEND_IMM       = 0x0b,
 	MTHCA_OPCODE_RDMA_READ      = 0x10,
 	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
 	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
 	MTHCA_OPCODE_BIND_MW        = 0x18,
 	MTHCA_OPCODE_INVALID        = 0xff
 };
 
 /*
  * Single-bit command-interface flags.
  * NOTE(review): presumably stored in mthca_cmd.flags -- confirm.
  */
 enum {
 	MTHCA_CMD_USE_EVENTS         = 1 << 0,
 	MTHCA_CMD_POST_DOORBELLS     = 1 << 1
 };
 
 /* Number of entries in mthca_cmd.dbell_offsets[]. */
 enum {
 	MTHCA_CMD_NUM_DBELL_DWORDS = 8
 };
 
 /*
  * Command-interface state, embedded in struct mthca_dev as 'cmd'.
  * dbell_offsets[] holds MTHCA_CMD_NUM_DBELL_DWORDS entries.
  * NOTE(review): the roles of the individual locks, semaphores and
  * context fields are not visible in this header -- see the command
  * implementation before relying on them.
  */
 struct mthca_cmd {
 	struct pci_pool          *pool;
 	struct mutex              hcr_mutex;
 	struct semaphore 	  poll_sem;
 	struct semaphore 	  event_sem;
 	int              	  max_cmds;
 	spinlock_t                context_lock;
 	int                       free_head;
 	struct mthca_cmd_context *context;
 	u16                       token_mask;
 	u32                       flags;
 	void __iomem             *dbell_map;
 	u16                       dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
 };
 
 /*
  * Collection of device limit and capability values, embedded in
  * struct mthca_dev as 'limits'.
  * NOTE(review): presumably filled in from firmware/device queries during
  * initialisation -- confirm where these are assigned.
  */
 struct mthca_limits {
 	int      num_ports;
 	int      vl_cap;
 	int      mtu_cap;
 	int      gid_table_len;
 	int      pkey_table_len;
 	int      local_ca_ack_delay;
 	int      num_uars;
 	int      max_sg;
 	int      num_qps;
 	int      max_wqes;
 	int	 max_desc_sz;
 	int	 max_qp_init_rdma;
 	int      reserved_qps;
 	int      num_srqs;
 	int      max_srq_wqes;
 	int      max_srq_sge;
 	int      reserved_srqs;
 	int      num_eecs;
 	int      reserved_eecs;
 	int      num_cqs;
 	int      max_cqes;
 	int      reserved_cqs;
 	int      num_eqs;
 	int      reserved_eqs;
 	int      num_mpts;
 	int      num_mtt_segs;
 	int	 mtt_seg_size;
 	int      fmr_reserved_mtts;
 	int      reserved_mtts;
 	int      reserved_mrws;
 	int      reserved_uars;
 	int      num_mgms;
 	int      num_amgms;
 	int      reserved_mcgs;
 	int      num_pds;
 	int      reserved_pds;
 	u32      page_size_cap;
 	u32      flags;
 	u16      stat_rate_support;
 	u8       port_width_cap;
 };
 
 /*
  * Allocator state operated on by mthca_alloc(), mthca_free(),
  * mthca_alloc_init() and mthca_alloc_cleanup(), which are declared later
  * in this header.
  * NOTE(review): 'table' looks like a bitmap protected by 'lock' --
  * confirm in the allocator implementation.
  */
 struct mthca_alloc {
 	u32            last;
 	u32            top;
 	u32            max;
 	u32            mask;
 	spinlock_t     lock;
 	unsigned long *table;
 };
 
 /*
  * Container manipulated through the mthca_array_*() functions declared
  * later in this header.  Each page_list element pairs a table of void
  * pointers ('page') with a 'used' counter.
  * NOTE(review): presumably a two-level sparse array -- TODO confirm.
  */
 struct mthca_array {
 	struct {
 		void    **page;
 		int       used;
 	} *page_list;
 };
 
 /*
  * UAR table state, embedded in struct mthca_dev as 'uar_table': an
  * allocator plus a uarc base/size pair.
  */
 struct mthca_uar_table {
 	struct mthca_alloc alloc;
 	u64                uarc_base;
 	int                uarc_size;
 };
 
 /*
  * PD table state, embedded in struct mthca_dev as 'pd_table'; it consists
  * solely of an allocator.
  */
 struct mthca_pd_table {
 	struct mthca_alloc alloc;
 };
 
 /*
  * Allocator state used by struct mthca_mr_table (mtt_buddy,
  * fmr_mtt_buddy and tavor_fmr.mtt_buddy).
  * NOTE(review): by its name and the per-order 'bits'/'num_free' arrays
  * this is presumably a buddy allocator -- confirm in the MR code.
  */
 struct mthca_buddy {
 	unsigned long **bits;
 	int	       *num_free;
 	int             max_order;
 	spinlock_t      lock;
 };
 
 /*
  * Memory-region table state, embedded in struct mthca_dev as 'mr_table'.
  * Holds an MPT allocator, MTT mthca_buddy instances, MTT/MPT base values
  * and ICM table pointers, plus a 'tavor_fmr' sub-structure with I/O-mapped
  * bases and its own mthca_buddy.
  * NOTE(review): 'tavor_fmr' is presumably only meaningful on Tavor
  * hardware -- confirm.
  */
 struct mthca_mr_table {
 	struct mthca_alloc      mpt_alloc;
 	struct mthca_buddy      mtt_buddy;
 	struct mthca_buddy     *fmr_mtt_buddy;
 	u64                     mtt_base;
 	u64                     mpt_base;
 	struct mthca_icm_table *mtt_table;
 	struct mthca_icm_table *mpt_table;
 	struct {
 		void __iomem   *mpt_base;
 		void __iomem   *mtt_base;
 		struct mthca_buddy mtt_buddy;
 	} tavor_fmr;
 };
 
 /*
  * Event-queue table state, embedded in struct mthca_dev as 'eq_table'.
  * eq[] has MTHCA_NUM_EQ slots, one per MTHCA_EQ_* index.
  */
 struct mthca_eq_table {
 	struct mthca_alloc alloc;
 	void __iomem      *clr_int;
 	u32                clr_mask;
 	u32                arm_mask;
 	struct mthca_eq    eq[MTHCA_NUM_EQ];
 	u64                icm_virt;
 	struct page       *icm_page;
 	dma_addr_t         icm_dma;
 	int                have_irq;
 	u8                 inta_pin;
 };
 
 /*
  * Completion-queue table state, embedded in struct mthca_dev as
  * 'cq_table': an allocator, a spinlock, an mthca_array and an ICM table
  * pointer.
  */
 struct mthca_cq_table {
 	struct mthca_alloc 	alloc;
 	spinlock_t         	lock;
 	struct mthca_array      cq;
 	struct mthca_icm_table *table;
 };
 
 /*
  * SRQ table state, embedded in struct mthca_dev as 'srq_table'; same
  * member layout as struct mthca_cq_table with the array named 'srq'.
  */
 struct mthca_srq_table {
 	struct mthca_alloc 	alloc;
 	spinlock_t         	lock;
 	struct mthca_array      srq;
 	struct mthca_icm_table *table;
 };
 
 /*
  * Queue-pair table state, embedded in struct mthca_dev as 'qp_table':
  * an allocator, rdb base/shift values, sqp_start, a spinlock, an
  * mthca_array of QPs and three ICM table pointers.
  */
 struct mthca_qp_table {
 	struct mthca_alloc     	alloc;
 	u32                    	rdb_base;
 	int                    	rdb_shift;
 	int                    	sqp_start;
 	spinlock_t             	lock;
 	struct mthca_array     	qp;
 	struct mthca_icm_table *qp_table;
 	struct mthca_icm_table *eqp_table;
 	struct mthca_icm_table *rdb_table;
 };
 
 /*
  * Address-vector table state, embedded in struct mthca_dev as
  * 'av_table': a pci_pool, DDR AV count and base, an I/O mapping and an
  * allocator.
  */
 struct mthca_av_table {
 	struct pci_pool   *pool;
 	int                num_ddr_avs;
 	u64                ddr_av_base;
 	void __iomem      *av_map;
 	struct mthca_alloc alloc;
 };
 
 /*
  * Multicast-group table state, embedded in struct mthca_dev as
  * 'mcg_table': a mutex, an allocator and an ICM table pointer.
  */
 struct mthca_mcg_table {
 	struct mutex		mutex;
 	struct mthca_alloc 	alloc;
 	struct mthca_icm_table *table;
 };
 
 /*
  * Catastrophic-error state, embedded in struct mthca_dev as 'catas_err':
  * an address/size pair with its I/O mapping, a timer and a list link.
  * NOTE(review): presumably driven by mthca_start_catas_poll() and
  * mthca_stop_catas_poll() declared later in this header -- confirm.
  */
 struct mthca_catas_err {
 	u64			addr;
 	u32 __iomem	       *map;
 	u32			size;
 	struct timer_list	timer;
 	struct list_head	list;
 };
 
 extern struct mutex mthca_device_mutex;
 
 /*
  * Per-device driver state.
  *
  * The generic struct ib_device is embedded as 'ib_dev'; to_mdev() uses
  * container_of() on that member to recover the enclosing mthca_dev.
  * 'mthca_flags' is the field mthca_is_memfree() tests against
  * MTHCA_FLAG_MEMFREE.  The per-port arrays at the end are sized by
  * MTHCA_MAX_PORTS and 'board_id' by MTHCA_BOARD_ID_LEN.
  *
  * NOTE(review): the 'fw' and 'eq_regs' unions have 'tavor' and 'arbel'
  * variants; which one is valid presumably depends on the HCA type --
  * confirm before reading either.
  */
 struct mthca_dev {
 	struct ib_device  ib_dev;
 	struct pci_dev   *pdev;
 
 	int          	 hca_type;
 	unsigned long	 mthca_flags;
 	unsigned long    device_cap_flags;
 
 	u32              rev_id;
 	char             board_id[MTHCA_BOARD_ID_LEN];
 
 	/* firmware info */
 	u64              fw_ver;
 	union {
 		struct {
 			u64 fw_start;
 			u64 fw_end;
 		}        tavor;
 		struct {
 			u64 clr_int_base;
 			u64 eq_arm_base;
 			u64 eq_set_ci_base;
 			struct mthca_icm *fw_icm;
 			struct mthca_icm *aux_icm;
 			u16 fw_pages;
 		}        arbel;
 	}                fw;
 
 	u64              ddr_start;
 	u64              ddr_end;
 
 	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
 	struct mutex cap_mask_mutex;
 
 	void __iomem    *hcr;
 	void __iomem    *kar;
 	void __iomem    *clr_base;
 	union {
 		struct {
 			void __iomem *ecr_base;
 		} tavor;
 		struct {
 			void __iomem *eq_arm;
 			void __iomem *eq_set_ci_base;
 		} arbel;
 	} eq_regs;
 
 	struct mthca_cmd    cmd;
 	struct mthca_limits limits;
 
 	struct mthca_uar_table uar_table;
 	struct mthca_pd_table  pd_table;
 	struct mthca_mr_table  mr_table;
 	struct mthca_eq_table  eq_table;
 	struct mthca_cq_table  cq_table;
 	struct mthca_srq_table srq_table;
 	struct mthca_qp_table  qp_table;
 	struct mthca_av_table  av_table;
 	struct mthca_mcg_table mcg_table;
 
 	struct mthca_catas_err catas_err;
 
 	struct mthca_uar       driver_uar;
 	struct mthca_db_table *db_tab;
 	struct mthca_pd        driver_pd;
 	struct mthca_mr        driver_mr;
 
 	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
 	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
 	spinlock_t            sm_lock;
 	u8                    rate[MTHCA_MAX_PORTS];
 	int		      active;
 };
 
 /*
  * Logging helpers.  Each takes a struct mthca_dev pointer expression, a
  * printf-style format and optional arguments, and logs against
  * &(mdev)->pdev->dev.
  *
  * mthca_dbg() only prints (at KERN_DEBUG) when the driver is built with
  * CONFIG_INFINIBAND_MTHCA_DEBUG and mthca_debug_level is non-zero;
  * without that option it merely evaluates mdev.
  *
  * The mdev argument is parenthesized in every expansion so that compound
  * expressions (e.g. a conditional) can be passed safely.
  */
 #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
 extern int mthca_debug_level;
 
 #define mthca_dbg(mdev, format, arg...)					\
 	do {								\
 		if (mthca_debug_level)					\
 			dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, ## arg); \
 	} while (0)
 
 #else /* CONFIG_INFINIBAND_MTHCA_DEBUG */
 
 #define mthca_dbg(mdev, format, arg...) do { (void) (mdev); } while (0)
 
 #endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
 
 #define mthca_err(mdev, format, arg...) \
 	dev_err(&(mdev)->pdev->dev, format, ## arg)
 #define mthca_info(mdev, format, arg...) \
 	dev_info(&(mdev)->pdev->dev, format, ## arg)
 #define mthca_warn(mdev, format, arg...) \
 	dev_warn(&(mdev)->pdev->dev, format, ## arg)
 
 /*
  * Called from the 'default' arm of the size switches below, i.e. only
  * when the operand size is not 1, 2, 4 or 8 bytes.
  * NOTE(review): presumably never defined anywhere, so that an unsupported
  * operand size turns into a link error -- TODO confirm.
  */
 extern void __buggy_use_of_MTHCA_GET(void);
 extern void __buggy_use_of_MTHCA_PUT(void);
 
 /*
  * MTHCA_GET(dest, source, offset): load a field located 'offset' bytes
  * into 'source' and store it in 'dest'.  The width is taken from
  * sizeof(dest); 2-, 4- and 8-byte fields are converted with
  * be16/32/64_to_cpup(), a 1-byte field is copied as-is.
  */
 #define MTHCA_GET(dest, source, offset)                               \
 	do {                                                          \
 		void *__p = (char *) (source) + (offset);             \
 		switch (sizeof (dest)) {                              \
 		case 1: (dest) = *(u8 *) __p;       break;	      \
 		case 2: (dest) = be16_to_cpup(__p); break;	      \
 		case 4: (dest) = be32_to_cpup(__p); break;	      \
 		case 8: (dest) = be64_to_cpup(__p); break;	      \
 		default: __buggy_use_of_MTHCA_GET();		      \
 		}                                                     \
 	} while (0)
 
 /*
  * MTHCA_PUT(dest, source, offset): store 'source' at 'offset' bytes into
  * 'dest'.  The width is taken from sizeof(source); 2-, 4- and 8-byte
  * values are converted with cpu_to_be16/32/64(), a 1-byte value is
  * stored as-is.
  */
 #define MTHCA_PUT(dest, source, offset)                               \
 	do {                                                          \
 		void *__d = ((char *) (dest) + (offset));	      \
 		switch (sizeof(source)) {                             \
 		case 1: *(u8 *) __d = (source);                break; \
 		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
 		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
 		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
 		default: __buggy_use_of_MTHCA_PUT();		      \
 		}                                                     \
 	} while (0)
 
 int mthca_reset(struct mthca_dev *mdev);
 
 u32 mthca_alloc(struct mthca_alloc *alloc);
 void mthca_free(struct mthca_alloc *alloc, u32 obj);
 int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
 		     u32 reserved);
 void mthca_alloc_cleanup(struct mthca_alloc *alloc);
 void *mthca_array_get(struct mthca_array *array, int index);
 int mthca_array_set(struct mthca_array *array, int index, void *value);
 void mthca_array_clear(struct mthca_array *array, int index);
 int mthca_array_init(struct mthca_array *array, int nent);
 void mthca_array_cleanup(struct mthca_array *array, int nent);
 int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
 		    union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
 		    int hca_write, struct mthca_mr *mr);
 void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
 		    int is_direct, struct mthca_mr *mr);
 
 int mthca_init_uar_table(struct mthca_dev *dev);
 int mthca_init_pd_table(struct mthca_dev *dev);
 int mthca_init_mr_table(struct mthca_dev *dev);
 int mthca_init_eq_table(struct mthca_dev *dev);
 int mthca_init_cq_table(struct mthca_dev *dev);
 int mthca_init_srq_table(struct mthca_dev *dev);
 int mthca_init_qp_table(struct mthca_dev *dev);
 int mthca_init_av_table(struct mthca_dev *dev);
 int mthca_init_mcg_table(struct mthca_dev *dev);
 
 void mthca_cleanup_uar_table(struct mthca_dev *dev);
 void mthca_cleanup_pd_table(struct mthca_dev *dev);
 void mthca_cleanup_mr_table(struct mthca_dev *dev);
 void mthca_cleanup_eq_table(struct mthca_dev *dev);
 void mthca_cleanup_cq_table(struct mthca_dev *dev);
 void mthca_cleanup_srq_table(struct mthca_dev *dev);
 void mthca_cleanup_qp_table(struct mthca_dev *dev);
 void mthca_cleanup_av_table(struct mthca_dev *dev);
 void mthca_cleanup_mcg_table(struct mthca_dev *dev);
 
 int mthca_register_device(struct mthca_dev *dev);
 void mthca_unregister_device(struct mthca_dev *dev);
 
 void mthca_start_catas_poll(struct mthca_dev *dev);
 void mthca_stop_catas_poll(struct mthca_dev *dev);
 int __mthca_restart_one(struct pci_dev *pdev);
 int mthca_catas_init(void);
 void mthca_catas_cleanup(void);
 
 int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
 void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
 
 int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
 void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
 
 int mthca_write_mtt_size(struct mthca_dev *dev);
 
 struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
 void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
 int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
 		    int start_index, u64 *buffer_list, int list_len);
 int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
 		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr);
 int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
 			   u32 access, struct mthca_mr *mr);
 int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
 			u64 *buffer_list, int buffer_size_shift,
 			int list_len, u64 iova, u64 total_size,
 			u32 access, struct mthca_mr *mr);
 void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
 
 int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 		    u32 access, struct mthca_fmr *fmr);
 int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 			     int list_len, u64 iova);
 void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
 int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 			     int list_len, u64 iova);
 void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
 int mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
 
 int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
 void mthca_unmap_eq_icm(struct mthca_dev *dev);
 
 int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
 		  struct ib_wc *entry);
 int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 int mthca_init_cq(struct mthca_dev *dev, int nent,
 		  struct mthca_ucontext *ctx, u32 pdn,
 		  struct mthca_cq *cq);
 void mthca_free_cq(struct mthca_dev *dev,
 		   struct mthca_cq *cq);
 void mthca_cq_completion(struct mthca_dev *dev, u32 cqn);
 void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
 		    enum ib_event_type event_type);
 void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
 		    struct mthca_srq *srq);
 void mthca_cq_resize_copy_cqes(struct mthca_cq *cq);
 int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent);
 void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
 
 int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
 		    struct ib_srq_attr *attr, struct mthca_srq *srq);
 void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
 int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
 int mthca_max_srq_sge(struct mthca_dev *dev);
 void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
 		     enum ib_event_type event_type);
 void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr);
 int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
 			      struct ib_recv_wr **bad_wr);
 int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
 			      struct ib_recv_wr **bad_wr);
 
 void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
 		    enum ib_event_type event_type);
 int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
 		   struct ib_qp_init_attr *qp_init_attr);
 int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		    struct ib_udata *udata);
 int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			  struct ib_send_wr **bad_wr);
 int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			     struct ib_recv_wr **bad_wr);
 int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			  struct ib_send_wr **bad_wr);
 int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			     struct ib_recv_wr **bad_wr);
 void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
 			int index, int *dbd, __be32 *new_wqe);
 int mthca_alloc_qp(struct mthca_dev *dev,
 		   struct mthca_pd *pd,
 		   struct mthca_cq *send_cq,
 		   struct mthca_cq *recv_cq,
 		   enum ib_qp_type type,
 		   enum ib_sig_type send_policy,
 		   struct ib_qp_cap *cap,
 		   struct mthca_qp *qp);
 int mthca_alloc_sqp(struct mthca_dev *dev,
 		    struct mthca_pd *pd,
 		    struct mthca_cq *send_cq,
 		    struct mthca_cq *recv_cq,
 		    enum ib_sig_type send_policy,
 		    struct ib_qp_cap *cap,
 		    int qpn,
 		    int port,
 		    struct mthca_sqp *sqp);
 void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
 int mthca_create_ah(struct mthca_dev *dev,
 		    struct mthca_pd *pd,
 		    struct ib_ah_attr *ah_attr,
 		    struct mthca_ah *ah);
 int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
 int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
 		  struct ib_ud_header *header);
 int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr);
 int mthca_ah_grh_present(struct mthca_ah *ah);
 u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
 enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
 
 int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 
 int mthca_process_mad(struct ib_device *ibdev,
 		      int mad_flags,
 		      u8 port_num,
 		      struct ib_wc *in_wc,
 		      struct ib_grh *in_grh,
 		      struct ib_mad *in_mad,
 		      struct ib_mad *out_mad);
 int mthca_create_agents(struct mthca_dev *dev);
 void mthca_free_agents(struct mthca_dev *dev);
 
 /*
  * Recover the enclosing struct mthca_dev from a pointer to its embedded
  * 'ib_dev' member.
  */
 static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
 {
 	struct mthca_dev *mdev;
 
 	mdev = container_of(ibdev, struct mthca_dev, ib_dev);
 	return mdev;
 }
 
 /*
  * Return non-zero when MTHCA_FLAG_MEMFREE is set in dev->mthca_flags,
  * zero otherwise.  The result is the masked bit itself, not 0/1.
  */
 static inline int mthca_is_memfree(struct mthca_dev *dev)
 {
 	unsigned long flags = dev->mthca_flags;
 
 	return flags & MTHCA_FLAG_MEMFREE;
 }
 
 #endif /* MTHCA_DEV_H */