Index: head/sys/kern/kern_alq.c =================================================================== --- head/sys/kern/kern_alq.c (revision 296687) +++ head/sys/kern/kern_alq.c (revision 296688) @@ -1,973 +1,973 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * Copyright (c) 2008-2009, Lawrence Stewart * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Async. Logging Queue */ struct alq { char *aq_entbuf; /* Buffer for stored entries */ int aq_entmax; /* Max entries */ int aq_entlen; /* Entry length */ int aq_freebytes; /* Bytes available in buffer */ int aq_buflen; /* Total length of our buffer */ int aq_writehead; /* Location for next write */ int aq_writetail; /* Flush starts at this location */ int aq_wrapearly; /* # bytes left blank at end of buf */ int aq_flags; /* Queue flags */ int aq_waiters; /* Num threads waiting for resources * NB: Used as a wait channel so must * not be first field in the alq struct */ struct ale aq_getpost; /* ALE for use by get/post */ struct mtx aq_mtx; /* Queue lock */ struct vnode *aq_vp; /* Open vnode handle */ struct ucred *aq_cred; /* Credentials of the opening thread */ LIST_ENTRY(alq) aq_act; /* List of active queues */ LIST_ENTRY(alq) aq_link; /* List of all queues */ }; #define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */ #define AQ_ACTIVE 0x0002 /* on the active list */ #define AQ_FLUSHING 0x0004 /* doing IO */ #define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */ #define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */ #define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */ #define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx) #define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx) #define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen) static MALLOC_DEFINE(M_ALD, "ALD", "ALD"); /* * The ald_mtx protects the ald_queues list and the ald_active list. 
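/*
 * The aq_* fields above implement a byte-oriented ring buffer: aq_writehead
 * is where the next write lands, aq_writetail is where the next flush starts,
 * and aq_freebytes tracks remaining space. Below is a minimal standalone
 * model of that bookkeeping and the HAS_PENDING_DATA() test (illustrative
 * only; the ring_* names are not part of the ALQ API and this is not kernel
 * code).
 */
#include <assert.h>
#include <stdio.h>

struct ring {
	int buflen;	/* total buffer size (aq_buflen) */
	int freebytes;	/* bytes still available (aq_freebytes) */
	int writehead;	/* next byte to be written (aq_writehead) */
	int writetail;	/* next byte to be flushed (aq_writetail) */
};

#define	RING_HAS_PENDING_DATA(r) ((r)->freebytes != (r)->buflen)

/* Account for 'len' bytes appended at writehead (wrap handling omitted). */
static void
ring_commit(struct ring *r, int len)
{
	assert(len <= r->freebytes);
	r->writehead = (r->writehead + len) % r->buflen;
	r->freebytes -= len;
}

/* Account for 'len' bytes flushed from writetail. */
static void
ring_flush(struct ring *r, int len)
{
	r->writetail = (r->writetail + len) % r->buflen;
	r->freebytes += len;
	/* Reset indexes when empty, as alq_doio() does, to minimise wraps. */
	if (!RING_HAS_PENDING_DATA(r))
		r->writehead = r->writetail = 0;
}

int
main(void)
{
	struct ring r = { .buflen = 16, .freebytes = 16 };

	ring_commit(&r, 10);
	printf("pending=%d free=%d\n", RING_HAS_PENDING_DATA(&r), r.freebytes);
	ring_flush(&r, 10);
	printf("pending=%d free=%d\n", RING_HAS_PENDING_DATA(&r), r.freebytes);
	return (0);
}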
*/ static struct mtx ald_mtx; static LIST_HEAD(, alq) ald_queues; static LIST_HEAD(, alq) ald_active; static int ald_shutingdown = 0; struct thread *ald_thread; static struct proc *ald_proc; static eventhandler_tag alq_eventhandler_tag = NULL; #define ALD_LOCK() mtx_lock(&ald_mtx) #define ALD_UNLOCK() mtx_unlock(&ald_mtx) /* Daemon functions */ static int ald_add(struct alq *); static int ald_rem(struct alq *); static void ald_startup(void *); static void ald_daemon(void); static void ald_shutdown(void *, int); static void ald_activate(struct alq *); static void ald_deactivate(struct alq *); /* Internal queue functions */ static void alq_shutdown(struct alq *); static void alq_destroy(struct alq *); static int alq_doio(struct alq *); /* * Add a new queue to the global list. Fail if we're shutting down. */ static int ald_add(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_INSERT_HEAD(&ald_queues, alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Remove a queue from the global list unless we're shutting down. If so, * the ald will take care of cleaning up it's resources. */ static int ald_rem(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_REMOVE(alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Put a queue on the active list. This will schedule it for writing. */ static void ald_activate(struct alq *alq) { LIST_INSERT_HEAD(&ald_active, alq, aq_act); wakeup(&ald_active); } static void ald_deactivate(struct alq *alq) { LIST_REMOVE(alq, aq_act); alq->aq_flags &= ~AQ_ACTIVE; } static void ald_startup(void *unused) { mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET); LIST_INIT(&ald_queues); LIST_INIT(&ald_active); } static void ald_daemon(void) { int needwakeup; struct alq *alq; ald_thread = FIRST_THREAD_IN_PROC(ald_proc); alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL, SHUTDOWN_PRI_FIRST); ALD_LOCK(); for (;;) { while ((alq = LIST_FIRST(&ald_active)) == NULL && !ald_shutingdown) mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0); /* Don't shutdown until all active ALQs are flushed. */ if (ald_shutingdown && alq == NULL) { ALD_UNLOCK(); break; } ALQ_LOCK(alq); ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); ALD_LOCK(); } kproc_exit(0); } static void ald_shutdown(void *arg, int howto) { struct alq *alq; ALD_LOCK(); /* Ensure no new queues can be created. */ ald_shutingdown = 1; /* Shutdown all ALQs prior to terminating the ald_daemon. */ while ((alq = LIST_FIRST(&ald_queues)) != NULL) { LIST_REMOVE(alq, aq_link); ALD_UNLOCK(); alq_shutdown(alq); ALD_LOCK(); } /* At this point, all ALQs are flushed and shutdown. */ /* * Wake ald_daemon so that it exits. It won't be able to do * anything until we mtx_sleep because we hold the ald_mtx. */ wakeup(&ald_active); /* Wait for ald_daemon to exit. */ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0); ALD_UNLOCK(); } static void alq_shutdown(struct alq *alq) { ALQ_LOCK(alq); /* Stop any new writers. */ alq->aq_flags |= AQ_SHUTDOWN; /* * If the ALQ isn't active but has unwritten data (possible if * the ALQ_NOACTIVATE flag has been used), explicitly activate the * ALQ here so that the pending data gets flushed by the ald_daemon. 
*/ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) { alq->aq_flags |= AQ_ACTIVE; ALQ_UNLOCK(alq); ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); ALQ_LOCK(alq); } /* Drain IO */ while (alq->aq_flags & AQ_ACTIVE) { alq->aq_flags |= AQ_WANTED; msleep_spin(alq, &alq->aq_mtx, "aldclose", 0); } ALQ_UNLOCK(alq); vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread); crfree(alq->aq_cred); } void alq_destroy(struct alq *alq) { /* Drain all pending IO. */ alq_shutdown(alq); mtx_destroy(&alq->aq_mtx); free(alq->aq_entbuf, M_ALD); free(alq, M_ALD); } /* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; int totlen; int iov; int wrapearly; KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); vp = alq->aq_vp; td = curthread; totlen = 0; iov = 1; wrapearly = alq->aq_wrapearly; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); /* Start the write from the location of our buffer tail pointer. */ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail; if (alq->aq_writetail < alq->aq_writehead) { /* Buffer not wrapped. */ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail; } else if (alq->aq_writehead == 0) { /* Buffer not wrapped (special case to avoid an empty iov). */ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; } else { /* * Buffer wrapped, requires 2 aiov entries: * - first is from writetail to end of buffer * - second is from start of buffer to writehead */ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; iov++; aiov[1].iov_base = alq->aq_entbuf; aiov[1].iov_len = alq->aq_writehead; totlen = aiov[0].iov_len + aiov[1].iov_len; } alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; /* Adjust writetail as required, taking into account wrapping. */ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) % alq->aq_buflen; alq->aq_freebytes += totlen + wrapearly; /* * If we just flushed part of the buffer which wrapped, reset the * wrapearly indicator. */ if (wrapearly) alq->aq_wrapearly = 0; /* * If we just flushed the buffer completely, reset indexes to 0 to * minimise buffer wraps. * This is also required to ensure alq_getn() can't wedge itself. */ if (!HAS_PENDING_DATA(alq)) alq->aq_writehead = alq->aq_writetail = 0; KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen), ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__)); if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); } static struct kproc_desc ald_kp = { "ALQ Daemon", ald_daemon, &ald_proc }; SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp); SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL); /* User visible queue functions */ /* * Create the queue data structure, allocate the buffer, and open the file. 
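/*
 * Standalone sketch of the one-or-two iovec split that alq_doio() above
 * performs when flushing a possibly wrapped buffer (illustrative only;
 * flush_iovecs is not part of the ALQ code, struct iovec is the usual
 * <sys/uio.h> definition).
 */
#include <sys/uio.h>
#include <stdio.h>

static int
flush_iovecs(char *buf, int buflen, int head, int tail, int wrapearly,
    struct iovec iov[2])
{
	int niov = 1;

	iov[0].iov_base = buf + tail;
	if (tail < head) {
		/* No wrap: a single contiguous region [tail, head). */
		iov[0].iov_len = head - tail;
	} else if (head == 0) {
		/* Data runs to the end of the buffer; skip the early-wrap hole. */
		iov[0].iov_len = buflen - tail - wrapearly;
	} else {
		/* Wrapped: tail..end (minus the hole), then start..head. */
		iov[0].iov_len = buflen - tail - wrapearly;
		iov[1].iov_base = buf;
		iov[1].iov_len = head;
		niov = 2;
	}
	return (niov);
}

int
main(void)
{
	char buf[32];
	struct iovec iov[2];
	int n;

	n = flush_iovecs(buf, sizeof(buf), 8, 24, 2, iov);
	printf("%d iovec(s): %zu then %zu bytes\n", n,
	    iov[0].iov_len, n == 2 ? iov[1].iov_len : 0);
	return (0);
}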
*/ int alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int flags) { struct thread *td; struct nameidata nd; struct alq *alq; int oflags; int error; KASSERT((size > 0), ("%s: size <= 0", __func__)); *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); oflags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); alq->aq_buflen = size; alq->aq_entmax = 0; alq->aq_entlen = 0; alq->aq_freebytes = alq->aq_buflen; alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO); alq->aq_writehead = alq->aq_writetail = 0; if (flags & ALQ_ORDERED) alq->aq_flags |= AQ_ORDERED; if ((error = ald_add(alq)) != 0) { alq_destroy(alq); return (error); } *alqp = alq; return (0); } int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { int ret; KASSERT((count >= 0), ("%s: count < 0", __func__)); if (count > 0) { if ((ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0)) == 0) { (*alqp)->aq_flags |= AQ_LEGACY; (*alqp)->aq_entmax = count; (*alqp)->aq_entlen = size; } } else ret = alq_open_flags(alqp, file, cred, cmode, size, 0); return (ret); } /* * Copy a new entry into the queue. If the operation would block either * wait or return an error depending on the value of waitok. */ int alq_writen(struct alq *alq, void *data, int len, int flags) { int activate, copy, ret; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > aq_buflen", __func__)); activate = ret = 0; copy = len; waitchan = NULL; ALQ_LOCK(alq); /* * Fail to perform the write and return EWOULDBLOCK if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) { ALQ_UNLOCK(alq); return (EWOULDBLOCK); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && aq_freebytes < len) or aq_freebytes >= len, either * enter while loop and sleep until we have enough free bytes (former) * or skip (latter). If AQ_ORDERED is set, only 1 thread at a time will * be in this loop. Otherwise, multiple threads may be sleeping here * competing for ALQ resources. 
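/*
 * Hedged usage sketch: how a kernel consumer might drive the variable-length
 * ALQ interface defined above (alq_open_flags/alq_writen/alq_close). Kernel
 * context only; the example_* names, the file path, mode and buffer size are
 * arbitrary illustrations, not part of this commit.
 */
#include <sys/param.h>
#include <sys/alq.h>
#include <sys/proc.h>

static struct alq *example_alq;

static int
example_alq_start(void)
{
	/* 64KB ring buffer, variable-length writes, no ALQ_ORDERED. */
	return (alq_open_flags(&example_alq, "/var/log/example.log",
	    curthread->td_ucred, 0644, 65536, 0));
}

static void
example_alq_log(const char *msg, int len)
{
	/* May sleep waiting for buffer space because of ALQ_WAITOK. */
	(void)alq_writen(example_alq, __DECONST(void *, msg), len, ALQ_WAITOK);
}

static void
example_alq_stop(void)
{
	/* Flushes pending data, closes the vnode and frees the queue. */
	alq_close(example_alq);
}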
*/ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0); alq->aq_waiters--; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. */ if (alq->aq_flags & AQ_SHUTDOWN) { ret = EWOULDBLOCK; goto unlock; } /* * If we need to wrap the buffer to accommodate the write, * we'll need 2 calls to bcopy. */ if ((alq->aq_buflen - alq->aq_writehead) < len) copy = alq->aq_buflen - alq->aq_writehead; /* Copy message (or part thereof if wrap required) to the buffer. */ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy); alq->aq_writehead += copy; if (alq->aq_writehead >= alq->aq_buflen) { KASSERT((alq->aq_writehead == alq->aq_buflen), ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)", __func__, alq->aq_writehead, alq->aq_buflen)); alq->aq_writehead = 0; } if (copy != len) { /* * Wrap the buffer by copying the remainder of our message * to the start of the buffer and resetting aq_writehead. */ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy); alq->aq_writehead = len - copy; } KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); alq->aq_freebytes -= len; if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); unlock: ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); return (ret); } int alq_write(struct alq *alq, void *data, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length write on variable length queue", __func__)); return (alq_writen(alq, data, alq->aq_entlen, flags)); } /* * Retrieve a pointer for the ALQ to write directly into, avoiding bcopy. */ struct ale * alq_getn(struct alq *alq, int len, int flags) { int contigbytes; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > alq->aq_buflen", __func__)); waitchan = NULL; ALQ_LOCK(alq); /* * Determine the number of free contiguous bytes. * We ensure elsewhere that if aq_writehead == aq_writetail because * the buffer is empty, they will both be set to 0 and therefore * aq_freebytes == aq_buflen and is fully contiguous. 
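/*
 * Standalone model of the split copy alq_writen() performs below when a
 * message has to wrap past the end of the buffer (illustrative only; not
 * kernel code, memcpy stands in for bcopy).
 */
#include <string.h>
#include <stdio.h>

static int	/* returns the new writehead */
ring_write(char *buf, int buflen, int writehead, const char *data, int len)
{
	int copy = len;

	/* If the message does not fit contiguously, split it in two. */
	if (buflen - writehead < len)
		copy = buflen - writehead;

	memcpy(buf + writehead, data, copy);
	writehead += copy;
	if (writehead >= buflen)
		writehead = 0;
	if (copy != len) {
		/* The second part lands at the start of the buffer. */
		memcpy(buf, data + copy, len - copy);
		writehead = len - copy;
	}
	return (writehead);
}

int
main(void)
{
	char buf[8];
	int head;

	head = ring_write(buf, sizeof(buf), 5, "abcdef", 6);
	printf("new head=%d, tail part=%.3s, wrapped part=%.3s\n",
	    head, buf + 5, buf);
	return (0);
}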
* If they are equal and the buffer is not empty, aq_freebytes will * be 0 indicating the buffer is full. */ if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else { contigbytes = alq->aq_buflen - alq->aq_writehead; if (contigbytes < len) { /* * Insufficient space at end of buffer to handle a * contiguous write. Wrap early if there's space at * the beginning. This will leave a hole at the end * of the buffer which we will have to skip over when * flushing the buffer to disk. */ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) { /* Keep track of # bytes left blank. */ alq->aq_wrapearly = contigbytes; /* Do the wrap and adjust counters. */ contigbytes = alq->aq_freebytes = alq->aq_writetail; alq->aq_writehead = 0; } } } /* * Return a NULL ALE if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && contigbytes < len)) { ALQ_UNLOCK(alq); return (NULL); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && contigbytes < len) or contigbytes >= len, either enter * while loop and sleep until we have enough contiguous free bytes * (former) or skip (latter). If AQ_ORDERED is set, only 1 thread at a * time will be in this loop. Otherwise, multiple threads may be * sleeping here competing for ALQ resources. */ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0); alq->aq_waiters--; if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else contigbytes = alq->aq_buflen - alq->aq_writehead; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && contigbytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. 
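/*
 * Standalone model of the "wrap early" decision in alq_getn() below: when
 * the tail end of the buffer cannot hold a contiguous message, the writer
 * skips it, records the skipped bytes in wrapearly and restarts at offset 0
 * (illustrative only; the ring_* names are not part of the ALQ API).
 */
#include <stdio.h>

struct ring {
	int buflen, freebytes, writehead, writetail, wrapearly;
};

static int	/* returns contiguous bytes now available at writehead */
ring_contig(struct ring *r, int len, int waitok)
{
	int contig;

	if (r->writehead <= r->writetail)
		return (r->freebytes);	/* free space already contiguous */

	contig = r->buflen - r->writehead;
	if (contig < len && (r->writetail >= len || waitok)) {
		/* Leave a hole at the end; the flush path skips it. */
		r->wrapearly = contig;
		contig = r->freebytes = r->writetail;
		r->writehead = 0;
	}
	return (contig);
}

int
main(void)
{
	struct ring r = { .buflen = 16, .freebytes = 9,
	    .writehead = 12, .writetail = 5 };

	printf("contig=%d head=%d wrapearly=%d\n",
	    ring_contig(&r, 5, 0), r.writehead, r.wrapearly);
	return (0);
}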
*/ if (alq->aq_flags & AQ_SHUTDOWN) { ALQ_UNLOCK(alq); if (waitchan != NULL) wakeup_one(waitchan); return (NULL); } /* * If we are here, we have a contiguous number of bytes >= len * available in our buffer starting at aq_writehead. */ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead; alq->aq_getpost.ae_bytesused = len; return (&alq->aq_getpost); } struct ale * alq_get(struct alq *alq, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length get on variable length queue", __func__)); return (alq_getn(alq, alq->aq_entlen, flags)); } void alq_post_flags(struct alq *alq, struct ale *ale, int flags) { int activate; void *waitchan; activate = 0; if (ale->ae_bytesused > 0) { if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } alq->aq_writehead += ale->ae_bytesused; alq->aq_freebytes -= ale->ae_bytesused; /* Wrap aq_writehead if we filled to the end of the buffer. */ if (alq->aq_writehead == alq->aq_buflen) alq->aq_writehead = 0; KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the * alq_getn() while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); } void alq_flush(struct alq *alq) { int needwakeup = 0; ALD_LOCK(); ALQ_LOCK(alq); /* * Pull the lever iff there is data to flush and we're * not already in the middle of a flush operation. */ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) { if (alq->aq_flags & AQ_ACTIVE) ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); } else ALD_UNLOCK(); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); } /* * Flush remaining data, close the file and free all resources. */ void alq_close(struct alq *alq) { /* Only flush and destroy alq if not already shutting down. */ if (ald_rem(alq) == 0) alq_destroy(alq); } static int alq_load_handler(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: ALD_LOCK(); /* Only allow unload if there are no open queues. */ if (LIST_FIRST(&ald_queues) == NULL) { ald_shutingdown = 1; ALD_UNLOCK(); EVENTHANDLER_DEREGISTER(shutdown_pre_sync, alq_eventhandler_tag); ald_shutdown(NULL, 0); mtx_destroy(&ald_mtx); } else { ALD_UNLOCK(); ret = EBUSY; } break; case MOD_UNLOAD: /* If MOD_QUIESCE failed we must fail here too. 
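/*
 * Hedged usage sketch of the zero-copy path above: reserve buffer space with
 * alq_getn(), format directly into the returned ALE, then hand it back with
 * alq_post_flags(). This mirrors what siftr_process_pkt() does further down.
 * Kernel context only; EXAMPLE_MAX_MSG_LEN and example_alq_log_value are
 * illustrative names, not part of this commit.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/alq.h>

#define	EXAMPLE_MAX_MSG_LEN	128

static void
example_alq_log_value(struct alq *alq, int value)
{
	struct ale *ale;

	/* May sleep for space; NULL only if the ALQ is shutting down. */
	ale = alq_getn(alq, EXAMPLE_MAX_MSG_LEN, ALQ_WAITOK);
	if (ale == NULL)
		return;

	/* Write straight into the queue's buffer and record the length used. */
	ale->ae_bytesused = snprintf(ale->ae_data, EXAMPLE_MAX_MSG_LEN,
	    "value=%d\n", value);

	alq_post_flags(alq, ale, 0);
}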
*/ if (ald_shutingdown == 0) ret = EBUSY; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t alq_mod = { "alq", alq_load_handler, NULL }; -DECLARE_MODULE(alq, alq_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(alq, alq_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_VERSION(alq, 1); Index: head/sys/netinet/siftr.c =================================================================== --- head/sys/netinet/siftr.c (revision 296687) +++ head/sys/netinet/siftr.c (revision 296688) @@ -1,1566 +1,1566 @@ /*- * Copyright (c) 2007-2009 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /****************************************************** * Statistical Information For TCP Research (SIFTR) * * A FreeBSD kernel module that adds very basic intrumentation to the * TCP stack, allowing internal stats to be recorded to a log file * for experimental, debugging and performance analysis purposes. * * SIFTR was first released in 2007 by James Healy and Lawrence Stewart whilst * working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Work on SIFTR v1.2.x was sponsored by the FreeBSD Foundation as part of * the "Enhancing the FreeBSD TCP Implementation" project 2008-2009. 
* More details are available at: * http://www.freebsdfoundation.org/ * http://caia.swin.edu.au/freebsd/etcp09/ * * Lawrence Stewart is the current maintainer, and all contact regarding * SIFTR should be directed to him via email: lastewart@swin.edu.au * * Initial release date: June 2007 * Most recent update: September 2010 ******************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SIFTR_IPV6 #include #include #endif /* SIFTR_IPV6 */ #include /* * Three digit version number refers to X.Y.Z where: * X is the major version number * Y is bumped to mark backwards incompatible changes * Z is bumped to mark backwards compatible changes */ #define V_MAJOR 1 #define V_BACKBREAK 2 #define V_BACKCOMPAT 4 #define MODVERSION __CONCAT(V_MAJOR, __CONCAT(V_BACKBREAK, V_BACKCOMPAT)) #define MODVERSION_STR __XSTRING(V_MAJOR) "." __XSTRING(V_BACKBREAK) "." \ __XSTRING(V_BACKCOMPAT) #define HOOK 0 #define UNHOOK 1 #define SIFTR_EXPECTED_MAX_TCP_FLOWS 65536 #define SYS_NAME "FreeBSD" #define PACKET_TAG_SIFTR 100 #define PACKET_COOKIE_SIFTR 21749576 #define SIFTR_LOG_FILE_MODE 0644 #define SIFTR_DISABLE 0 #define SIFTR_ENABLE 1 /* * Hard upper limit on the length of log messages. Bump this up if you add new * data fields such that the line length could exceed the below value. */ #define MAX_LOG_MSG_LEN 200 /* XXX: Make this a sysctl tunable. */ #define SIFTR_ALQ_BUFLEN (1000*MAX_LOG_MSG_LEN) /* * 1 byte for IP version * IPv4: src/dst IP (4+4) + src/dst port (2+2) = 12 bytes * IPv6: src/dst IP (16+16) + src/dst port (2+2) = 36 bytes */ #ifdef SIFTR_IPV6 #define FLOW_KEY_LEN 37 #else #define FLOW_KEY_LEN 13 #endif #ifdef SIFTR_IPV6 #define SIFTR_IPMODE 6 #else #define SIFTR_IPMODE 4 #endif /* useful macros */ #define CAST_PTR_INT(X) (*((int*)(X))) #define UPPER_SHORT(X) (((X) & 0xFFFF0000) >> 16) #define LOWER_SHORT(X) ((X) & 0x0000FFFF) #define FIRST_OCTET(X) (((X) & 0xFF000000) >> 24) #define SECOND_OCTET(X) (((X) & 0x00FF0000) >> 16) #define THIRD_OCTET(X) (((X) & 0x0000FF00) >> 8) #define FOURTH_OCTET(X) ((X) & 0x000000FF) static MALLOC_DEFINE(M_SIFTR, "siftr", "dynamic memory used by SIFTR"); static MALLOC_DEFINE(M_SIFTR_PKTNODE, "siftr_pktnode", "SIFTR pkt_node struct"); static MALLOC_DEFINE(M_SIFTR_HASHNODE, "siftr_hashnode", "SIFTR flow_hash_node struct"); /* Used as links in the pkt manager queue. */ struct pkt_node { /* Timestamp of pkt as noted in the pfil hook. */ struct timeval tval; /* Direction pkt is travelling; either PFIL_IN or PFIL_OUT. */ uint8_t direction; /* IP version pkt_node relates to; either INP_IPV4 or INP_IPV6. */ uint8_t ipver; /* Hash of the pkt which triggered the log message. */ uint32_t hash; /* Local/foreign IP address. */ #ifdef SIFTR_IPV6 uint32_t ip_laddr[4]; uint32_t ip_faddr[4]; #else uint8_t ip_laddr[4]; uint8_t ip_faddr[4]; #endif /* Local TCP port. */ uint16_t tcp_localport; /* Foreign TCP port. */ uint16_t tcp_foreignport; /* Congestion Window (bytes). */ u_long snd_cwnd; /* Sending Window (bytes). */ u_long snd_wnd; /* Receive Window (bytes). */ u_long rcv_wnd; /* Unused (was: Bandwidth Controlled Window (bytes)). */ u_long snd_bwnd; /* Slow Start Threshold (bytes). */ u_long snd_ssthresh; /* Current state of the TCP FSM. */ int conn_state; /* Max Segment Size (bytes). 
*/ u_int max_seg_size; /* * Smoothed RTT stored as found in the TCP control block * in units of (TCP_RTT_SCALE*hz). */ int smoothed_rtt; /* Is SACK enabled? */ u_char sack_enabled; /* Window scaling for snd window. */ u_char snd_scale; /* Window scaling for recv window. */ u_char rcv_scale; /* TCP control block flags. */ u_int flags; /* Retransmit timeout length. */ int rxt_length; /* Size of the TCP send buffer in bytes. */ u_int snd_buf_hiwater; /* Current num bytes in the send socket buffer. */ u_int snd_buf_cc; /* Size of the TCP receive buffer in bytes. */ u_int rcv_buf_hiwater; /* Current num bytes in the receive socket buffer. */ u_int rcv_buf_cc; /* Number of bytes inflight that we are waiting on ACKs for. */ u_int sent_inflight_bytes; /* Number of segments currently in the reassembly queue. */ int t_segqlen; /* Flowid for the connection. */ u_int flowid; /* Flow type for the connection. */ u_int flowtype; /* Link to next pkt_node in the list. */ STAILQ_ENTRY(pkt_node) nodes; }; struct flow_hash_node { uint16_t counter; uint8_t key[FLOW_KEY_LEN]; LIST_ENTRY(flow_hash_node) nodes; }; struct siftr_stats { /* # TCP pkts seen by the SIFTR PFIL hooks, including any skipped. */ uint64_t n_in; uint64_t n_out; /* # pkts skipped due to failed malloc calls. */ uint32_t nskip_in_malloc; uint32_t nskip_out_malloc; /* # pkts skipped due to failed mtx acquisition. */ uint32_t nskip_in_mtx; uint32_t nskip_out_mtx; /* # pkts skipped due to failed inpcb lookups. */ uint32_t nskip_in_inpcb; uint32_t nskip_out_inpcb; /* # pkts skipped due to failed tcpcb lookups. */ uint32_t nskip_in_tcpcb; uint32_t nskip_out_tcpcb; /* # pkts skipped due to stack reinjection. */ uint32_t nskip_in_dejavu; uint32_t nskip_out_dejavu; }; static DPCPU_DEFINE(struct siftr_stats, ss); static volatile unsigned int siftr_exit_pkt_manager_thread = 0; static unsigned int siftr_enabled = 0; static unsigned int siftr_pkts_per_log = 1; static unsigned int siftr_generate_hashes = 0; /* static unsigned int siftr_binary_log = 0; */ static char siftr_logfile[PATH_MAX] = "/var/log/siftr.log"; static char siftr_logfile_shadow[PATH_MAX] = "/var/log/siftr.log"; static u_long siftr_hashmask; STAILQ_HEAD(pkthead, pkt_node) pkt_queue = STAILQ_HEAD_INITIALIZER(pkt_queue); LIST_HEAD(listhead, flow_hash_node) *counter_hash; static int wait_for_pkt; static struct alq *siftr_alq = NULL; static struct mtx siftr_pkt_queue_mtx; static struct mtx siftr_pkt_mgr_mtx; static struct thread *siftr_pkt_manager_thr = NULL; /* * pfil.h defines PFIL_IN as 1 and PFIL_OUT as 2, * which we use as an index into this array. */ static char direction[3] = {'\0', 'i','o'}; /* Required function prototypes. */ static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS); static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS); /* Declare the net.inet.siftr sysctl tree and populate it. 
*/ SYSCTL_DECL(_net_inet_siftr); SYSCTL_NODE(_net_inet, OID_AUTO, siftr, CTLFLAG_RW, NULL, "siftr related settings"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, enabled, CTLTYPE_UINT|CTLFLAG_RW, &siftr_enabled, 0, &siftr_sysctl_enabled_handler, "IU", "switch siftr module operations on/off"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, logfile, CTLTYPE_STRING|CTLFLAG_RW, &siftr_logfile_shadow, sizeof(siftr_logfile_shadow), &siftr_sysctl_logfile_name_handler, "A", "file to save siftr log messages to"); SYSCTL_UINT(_net_inet_siftr, OID_AUTO, ppl, CTLFLAG_RW, &siftr_pkts_per_log, 1, "number of packets between generating a log message"); SYSCTL_UINT(_net_inet_siftr, OID_AUTO, genhashes, CTLFLAG_RW, &siftr_generate_hashes, 0, "enable packet hash generation"); /* XXX: TODO SYSCTL_UINT(_net_inet_siftr, OID_AUTO, binary, CTLFLAG_RW, &siftr_binary_log, 0, "write log files in binary instead of ascii"); */ /* Begin functions. */ static void siftr_process_pkt(struct pkt_node * pkt_node) { struct flow_hash_node *hash_node; struct listhead *counter_list; struct siftr_stats *ss; struct ale *log_buf; uint8_t key[FLOW_KEY_LEN]; uint8_t found_match, key_offset; hash_node = NULL; ss = DPCPU_PTR(ss); found_match = 0; key_offset = 1; /* * Create the key that will be used to create a hash index * into our hash table. Our key consists of: * ipversion, localip, localport, foreignip, foreignport */ key[0] = pkt_node->ipver; memcpy(key + key_offset, &pkt_node->ip_laddr, sizeof(pkt_node->ip_laddr)); key_offset += sizeof(pkt_node->ip_laddr); memcpy(key + key_offset, &pkt_node->tcp_localport, sizeof(pkt_node->tcp_localport)); key_offset += sizeof(pkt_node->tcp_localport); memcpy(key + key_offset, &pkt_node->ip_faddr, sizeof(pkt_node->ip_faddr)); key_offset += sizeof(pkt_node->ip_faddr); memcpy(key + key_offset, &pkt_node->tcp_foreignport, sizeof(pkt_node->tcp_foreignport)); counter_list = counter_hash + (hash32_buf(key, sizeof(key), 0) & siftr_hashmask); /* * If the list is not empty i.e. the hash index has * been used by another flow previously. */ if (LIST_FIRST(counter_list) != NULL) { /* * Loop through the hash nodes in the list. * There should normally only be 1 hash node in the list, * except if there have been collisions at the hash index * computed by hash32_buf(). */ LIST_FOREACH(hash_node, counter_list, nodes) { /* * Check if the key for the pkt we are currently * processing is the same as the key stored in the * hash node we are currently processing. * If they are the same, then we've found the * hash node that stores the counter for the flow * the pkt belongs to. */ if (memcmp(hash_node->key, key, sizeof(key)) == 0) { found_match = 1; break; } } } /* If this flow hash hasn't been seen before or we have a collision. */ if (hash_node == NULL || !found_match) { /* Create a new hash node to store the flow's counter. */ hash_node = malloc(sizeof(struct flow_hash_node), M_SIFTR_HASHNODE, M_WAITOK); if (hash_node != NULL) { /* Initialise our new hash node list entry. */ hash_node->counter = 0; memcpy(hash_node->key, key, sizeof(key)); LIST_INSERT_HEAD(counter_list, hash_node, nodes); } else { /* Malloc failed. */ if (pkt_node->direction == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return; } } else if (siftr_pkts_per_log > 1) { /* * Taking the remainder of the counter divided * by the current value of siftr_pkts_per_log * and storing that in counter provides a neat * way to modulate the frequency of log * messages being written to the log file. 
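/*
 * Standalone sketch of the flow key layout built above in siftr_process_pkt()
 * (IPv4 case, FLOW_KEY_LEN == 13): 1 byte IP version, then local IP, local
 * port, foreign IP, foreign port. Illustrative only: ex_hash32_buf is a
 * stand-in for the kernel's hash32_buf(), and the kernel passes INP_IPV4/
 * INP_IPV6 rather than a literal version number.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define	EX_FLOW_KEY_LEN	13

static uint32_t
ex_hash32_buf(const void *buf, size_t len, uint32_t seed)
{
	const uint8_t *p = buf;
	uint32_t h = seed;

	while (len--)			/* simple FNV-1a style stand-in */
		h = (h ^ *p++) * 0x01000193;
	return (h);
}

static uint32_t
ex_flow_bucket(uint8_t ipver, const uint8_t laddr[4], uint16_t lport,
    const uint8_t faddr[4], uint16_t fport, uint32_t hashmask)
{
	uint8_t key[EX_FLOW_KEY_LEN];
	int off = 0;

	key[off++] = ipver;
	memcpy(key + off, laddr, 4);	off += 4;
	memcpy(key + off, &lport, 2);	off += 2;
	memcpy(key + off, faddr, 4);	off += 4;
	memcpy(key + off, &fport, 2);

	return (ex_hash32_buf(key, sizeof(key), 0) & hashmask);
}

int
main(void)
{
	uint8_t laddr[4] = { 10, 0, 0, 1 }, faddr[4] = { 10, 0, 0, 2 };

	printf("bucket=%u\n", ex_flow_bucket(4, laddr, 12345, faddr, 80, 0xffff));
	return (0);
}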
*/ hash_node->counter = (hash_node->counter + 1) % siftr_pkts_per_log; /* * If we have not seen enough packets since the last time * we wrote a log message for this connection, return. */ if (hash_node->counter > 0) return; } log_buf = alq_getn(siftr_alq, MAX_LOG_MSG_LEN, ALQ_WAITOK); if (log_buf == NULL) return; /* Should only happen if the ALQ is shutting down. */ #ifdef SIFTR_IPV6 pkt_node->ip_laddr[3] = ntohl(pkt_node->ip_laddr[3]); pkt_node->ip_faddr[3] = ntohl(pkt_node->ip_faddr[3]); if (pkt_node->ipver == INP_IPV6) { /* IPv6 packet */ pkt_node->ip_laddr[0] = ntohl(pkt_node->ip_laddr[0]); pkt_node->ip_laddr[1] = ntohl(pkt_node->ip_laddr[1]); pkt_node->ip_laddr[2] = ntohl(pkt_node->ip_laddr[2]); pkt_node->ip_faddr[0] = ntohl(pkt_node->ip_faddr[0]); pkt_node->ip_faddr[1] = ntohl(pkt_node->ip_faddr[1]); pkt_node->ip_faddr[2] = ntohl(pkt_node->ip_faddr[2]); /* Construct an IPv6 log message. */ log_buf->ae_bytesused = snprintf(log_buf->ae_data, MAX_LOG_MSG_LEN, "%c,0x%08x,%zd.%06ld,%x:%x:%x:%x:%x:%x:%x:%x,%u,%x:%x:%x:" "%x:%x:%x:%x:%x,%u,%ld,%ld,%ld,%ld,%ld,%u,%u,%u,%u,%u,%u," "%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n", direction[pkt_node->direction], pkt_node->hash, pkt_node->tval.tv_sec, pkt_node->tval.tv_usec, UPPER_SHORT(pkt_node->ip_laddr[0]), LOWER_SHORT(pkt_node->ip_laddr[0]), UPPER_SHORT(pkt_node->ip_laddr[1]), LOWER_SHORT(pkt_node->ip_laddr[1]), UPPER_SHORT(pkt_node->ip_laddr[2]), LOWER_SHORT(pkt_node->ip_laddr[2]), UPPER_SHORT(pkt_node->ip_laddr[3]), LOWER_SHORT(pkt_node->ip_laddr[3]), ntohs(pkt_node->tcp_localport), UPPER_SHORT(pkt_node->ip_faddr[0]), LOWER_SHORT(pkt_node->ip_faddr[0]), UPPER_SHORT(pkt_node->ip_faddr[1]), LOWER_SHORT(pkt_node->ip_faddr[1]), UPPER_SHORT(pkt_node->ip_faddr[2]), LOWER_SHORT(pkt_node->ip_faddr[2]), UPPER_SHORT(pkt_node->ip_faddr[3]), LOWER_SHORT(pkt_node->ip_faddr[3]), ntohs(pkt_node->tcp_foreignport), pkt_node->snd_ssthresh, pkt_node->snd_cwnd, pkt_node->snd_bwnd, pkt_node->snd_wnd, pkt_node->rcv_wnd, pkt_node->snd_scale, pkt_node->rcv_scale, pkt_node->conn_state, pkt_node->max_seg_size, pkt_node->smoothed_rtt, pkt_node->sack_enabled, pkt_node->flags, pkt_node->rxt_length, pkt_node->snd_buf_hiwater, pkt_node->snd_buf_cc, pkt_node->rcv_buf_hiwater, pkt_node->rcv_buf_cc, pkt_node->sent_inflight_bytes, pkt_node->t_segqlen, pkt_node->flowid, pkt_node->flowtype); } else { /* IPv4 packet */ pkt_node->ip_laddr[0] = FIRST_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[1] = SECOND_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[2] = THIRD_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[3] = FOURTH_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_faddr[0] = FIRST_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[1] = SECOND_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[2] = THIRD_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[3] = FOURTH_OCTET(pkt_node->ip_faddr[3]); #endif /* SIFTR_IPV6 */ /* Construct an IPv4 log message. 
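/*
 * Standalone sketch of the per-flow rate limiting above: keeping the counter
 * modulo siftr_pkts_per_log means a log line is emitted only when the counter
 * wraps back to zero, i.e. for every Nth packet of the flow (illustrative
 * only).
 */
#include <stdio.h>

int
main(void)
{
	unsigned counter = 0, pkts_per_log = 3, pkt;

	for (pkt = 1; pkt <= 9; pkt++) {
		counter = (counter + 1) % pkts_per_log;
		if (counter == 0)	/* kernel: skip when counter > 0 */
			printf("packet %u: logged\n", pkt);
	}
	return (0);
}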
*/ log_buf->ae_bytesused = snprintf(log_buf->ae_data, MAX_LOG_MSG_LEN, "%c,0x%08x,%jd.%06ld,%u.%u.%u.%u,%u,%u.%u.%u.%u,%u,%ld,%ld," "%ld,%ld,%ld,%u,%u,%u,%u,%u,%u,%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n", direction[pkt_node->direction], pkt_node->hash, (intmax_t)pkt_node->tval.tv_sec, pkt_node->tval.tv_usec, pkt_node->ip_laddr[0], pkt_node->ip_laddr[1], pkt_node->ip_laddr[2], pkt_node->ip_laddr[3], ntohs(pkt_node->tcp_localport), pkt_node->ip_faddr[0], pkt_node->ip_faddr[1], pkt_node->ip_faddr[2], pkt_node->ip_faddr[3], ntohs(pkt_node->tcp_foreignport), pkt_node->snd_ssthresh, pkt_node->snd_cwnd, pkt_node->snd_bwnd, pkt_node->snd_wnd, pkt_node->rcv_wnd, pkt_node->snd_scale, pkt_node->rcv_scale, pkt_node->conn_state, pkt_node->max_seg_size, pkt_node->smoothed_rtt, pkt_node->sack_enabled, pkt_node->flags, pkt_node->rxt_length, pkt_node->snd_buf_hiwater, pkt_node->snd_buf_cc, pkt_node->rcv_buf_hiwater, pkt_node->rcv_buf_cc, pkt_node->sent_inflight_bytes, pkt_node->t_segqlen, pkt_node->flowid, pkt_node->flowtype); #ifdef SIFTR_IPV6 } #endif alq_post_flags(siftr_alq, log_buf, 0); } static void siftr_pkt_manager_thread(void *arg) { STAILQ_HEAD(pkthead, pkt_node) tmp_pkt_queue = STAILQ_HEAD_INITIALIZER(tmp_pkt_queue); struct pkt_node *pkt_node, *pkt_node_temp; uint8_t draining; draining = 2; mtx_lock(&siftr_pkt_mgr_mtx); /* draining == 0 when queue has been flushed and it's safe to exit. */ while (draining) { /* * Sleep until we are signalled to wake because thread has * been told to exit or until 1 tick has passed. */ mtx_sleep(&wait_for_pkt, &siftr_pkt_mgr_mtx, PWAIT, "pktwait", 1); /* Gain exclusive access to the pkt_node queue. */ mtx_lock(&siftr_pkt_queue_mtx); /* * Move pkt_queue to tmp_pkt_queue, which leaves * pkt_queue empty and ready to receive more pkt_nodes. */ STAILQ_CONCAT(&tmp_pkt_queue, &pkt_queue); /* * We've finished making changes to the list. Unlock it * so the pfil hooks can continue queuing pkt_nodes. */ mtx_unlock(&siftr_pkt_queue_mtx); /* * We can't hold a mutex whilst calling siftr_process_pkt * because ALQ might sleep waiting for buffer space. */ mtx_unlock(&siftr_pkt_mgr_mtx); /* Flush all pkt_nodes to the log file. */ STAILQ_FOREACH_SAFE(pkt_node, &tmp_pkt_queue, nodes, pkt_node_temp) { siftr_process_pkt(pkt_node); STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); free(pkt_node, M_SIFTR_PKTNODE); } KASSERT(STAILQ_EMPTY(&tmp_pkt_queue), ("SIFTR tmp_pkt_queue not empty after flush")); mtx_lock(&siftr_pkt_mgr_mtx); /* * If siftr_exit_pkt_manager_thread gets set during the window * where we are draining the tmp_pkt_queue above, there might * still be pkts in pkt_queue that need to be drained. * Allow one further iteration to occur after * siftr_exit_pkt_manager_thread has been set to ensure * pkt_queue is completely empty before we kill the thread. * * siftr_exit_pkt_manager_thread is set only after the pfil * hooks have been removed, so only 1 extra iteration * is needed to drain the queue. */ if (siftr_exit_pkt_manager_thread) draining--; } mtx_unlock(&siftr_pkt_mgr_mtx); /* Calls wakeup on this thread's struct thread ptr. */ kthread_exit(); } static uint32_t hash_pkt(struct mbuf *m, uint32_t offset) { uint32_t hash; hash = 0; while (m != NULL && offset > m->m_len) { /* * The IP packet payload does not start in this mbuf, so * need to figure out which mbuf it starts in and what offset * into the mbuf's data region the payload starts at. 
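/*
 * Standalone model of the drain pattern used by siftr_pkt_manager_thread()
 * above: steal the whole producer queue while holding the queue lock, then
 * process the stolen list with no locks held, so producers are blocked only
 * for the pointer swap. Illustrative only: the kernel code uses
 * STAILQ_CONCAT() under siftr_pkt_queue_mtx and preserves FIFO order; this
 * sketch swaps in a pthread mutex and a plain LIFO list for brevity.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int		 val;
	struct node	*next;
};

static struct node *queue_head;			/* producer side */
static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
produce(int val)
{
	struct node *n = malloc(sizeof(*n));

	if (n == NULL)
		return;
	n->val = val;
	pthread_mutex_lock(&queue_mtx);
	n->next = queue_head;
	queue_head = n;
	pthread_mutex_unlock(&queue_mtx);
}

static void
drain(void)
{
	struct node *n, *stolen;

	pthread_mutex_lock(&queue_mtx);
	stolen = queue_head;			/* steal the whole list */
	queue_head = NULL;
	pthread_mutex_unlock(&queue_mtx);

	while ((n = stolen) != NULL) {		/* process with no lock held */
		stolen = n->next;
		printf("processed %d\n", n->val);
		free(n);
	}
}

int
main(void)
{
	produce(1);
	produce(2);
	drain();
	return (0);
}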
*/ offset -= m->m_len; m = m->m_next; } while (m != NULL) { /* Ensure there is data in the mbuf */ if ((m->m_len - offset) > 0) hash = hash32_buf(m->m_data + offset, m->m_len - offset, hash); m = m->m_next; offset = 0; } return (hash); } /* * Check if a given mbuf has the SIFTR mbuf tag. If it does, log the fact that * it's a reinjected packet and return. If it doesn't, tag the mbuf and return. * Return value >0 means the caller should skip processing this mbuf. */ static inline int siftr_chkreinject(struct mbuf *m, int dir, struct siftr_stats *ss) { if (m_tag_locate(m, PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, NULL) != NULL) { if (dir == PFIL_IN) ss->nskip_in_dejavu++; else ss->nskip_out_dejavu++; return (1); } else { struct m_tag *tag = m_tag_alloc(PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, 0, M_NOWAIT); if (tag == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return (1); } m_tag_prepend(m, tag); } return (0); } /* * Look up an inpcb for a packet. Return the inpcb pointer if found, or NULL * otherwise. */ static inline struct inpcb * siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, uint16_t dport, int dir, struct siftr_stats *ss) { struct inpcb *inp; /* We need the tcbinfo lock. */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_src, sport, &((struct ip6_hdr *)ip)->ip6_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); else inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_dst, dport, &((struct ip6_hdr *)ip)->ip6_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); /* If we can't find the inpcb, bail. */ if (inp == NULL) { if (dir == PFIL_IN) ss->nskip_in_inpcb++; else ss->nskip_out_inpcb++; } return (inp); } static inline void siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, int ipver, int dir, int inp_locally_locked) { #ifdef SIFTR_IPV6 if (ipver == INP_IPV4) { pn->ip_laddr[3] = inp->inp_laddr.s_addr; pn->ip_faddr[3] = inp->inp_faddr.s_addr; #else *((uint32_t *)pn->ip_laddr) = inp->inp_laddr.s_addr; *((uint32_t *)pn->ip_faddr) = inp->inp_faddr.s_addr; #endif #ifdef SIFTR_IPV6 } else { pn->ip_laddr[0] = inp->in6p_laddr.s6_addr32[0]; pn->ip_laddr[1] = inp->in6p_laddr.s6_addr32[1]; pn->ip_laddr[2] = inp->in6p_laddr.s6_addr32[2]; pn->ip_laddr[3] = inp->in6p_laddr.s6_addr32[3]; pn->ip_faddr[0] = inp->in6p_faddr.s6_addr32[0]; pn->ip_faddr[1] = inp->in6p_faddr.s6_addr32[1]; pn->ip_faddr[2] = inp->in6p_faddr.s6_addr32[2]; pn->ip_faddr[3] = inp->in6p_faddr.s6_addr32[3]; } #endif pn->tcp_localport = inp->inp_lport; pn->tcp_foreignport = inp->inp_fport; pn->snd_cwnd = tp->snd_cwnd; pn->snd_wnd = tp->snd_wnd; pn->rcv_wnd = tp->rcv_wnd; pn->snd_bwnd = 0; /* Unused, kept for compat. 
*/ pn->snd_ssthresh = tp->snd_ssthresh; pn->snd_scale = tp->snd_scale; pn->rcv_scale = tp->rcv_scale; pn->conn_state = tp->t_state; pn->max_seg_size = tp->t_maxseg; pn->smoothed_rtt = tp->t_srtt; pn->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0; pn->flags = tp->t_flags; pn->rxt_length = tp->t_rxtcur; pn->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat; pn->snd_buf_cc = sbused(&inp->inp_socket->so_snd); pn->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat; pn->rcv_buf_cc = sbused(&inp->inp_socket->so_rcv); pn->sent_inflight_bytes = tp->snd_max - tp->snd_una; pn->t_segqlen = tp->t_segqlen; pn->flowid = inp->inp_flowid; pn->flowtype = inp->inp_flowtype; /* We've finished accessing the tcb so release the lock. */ if (inp_locally_locked) INP_RUNLOCK(inp); pn->ipver = ipver; pn->direction = dir; /* * Significantly more accurate than using getmicrotime(), but slower! * Gives true microsecond resolution at the expense of a hit to * maximum pps throughput processing when SIFTR is loaded and enabled. */ microtime(&pn->tval); TCP_PROBE1(siftr, &pn); } /* * pfil hook that is called for each IPv4 packet making its way through the * stack in either direction. * The pfil subsystem holds a non-sleepable mutex somewhere when * calling our hook function, so we can't sleep at all. * It's very important to use the M_NOWAIT flag with all function calls * that support it so that they won't sleep, otherwise you get a panic. */ static int siftr_chkpkt(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { struct pkt_node *pn; struct ip *ip; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip_hl; int inp_locally_locked; inp_locally_locked = 0; ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip_{input|output} * already do the heavy lifting for us. */ ip = mtod(*m, struct ip *); /* Only continue processing if the packet is TCP. */ if (ip->ip_p != IPPROTO_TCP) goto ret; /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; /* * Create a tcphdr struct starting at the correct offset * in the IP packet. ip->ip_hl gives the ip header length * in 4-byte words, so multiply it to get the size in bytes. */ ip_hl = (ip->ip_hl << 2); th = (struct tcphdr *)((caddr_t)ip + ip_hl); /* * If the pfil hooks don't provide a pointer to the * inpcb, we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. 
*/ inp = siftr_findinpcb(INP_IPV4, ip, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret; else inp_locally_locked = 1; } INP_LOCK_ASSERT(inp); /* Find the TCP control block that corresponds with this packet */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionaly for a * packet sent during the shutdown phase of a TCP connection), * or we're in the timewait state, bail */ if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock; } pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock; } siftr_siftdata(pn, inp, tp, INP_IPV4, dir, inp_locally_locked); if (siftr_generate_hashes) { if ((*m)->m_pkthdr.csum_flags & CSUM_TCP) { /* * For outbound packets, the TCP checksum isn't * calculated yet. This is a problem for our packet * hashing as the receiver will calc a different hash * to ours if we don't include the correct TCP checksum * in the bytes being hashed. To work around this * problem, we manually calc the TCP checksum here in * software. We unset the CSUM_TCP flag so the lower * layers don't recalc it. */ (*m)->m_pkthdr.csum_flags &= ~CSUM_TCP; /* * Calculate the TCP checksum in software and assign * to correct TCP header field, which will follow the * packet mbuf down the stack. The trick here is that * tcp_output() sets th->th_sum to the checksum of the * pseudo header for us already. Because of the nature * of the checksumming algorithm, we can sum over the * entire IP payload (i.e. TCP header and data), which * will include the already calculated pseduo header * checksum, thus giving us the complete TCP checksum. * * To put it in simple terms, if checksum(1,2,3,4)=10, * then checksum(1,2,3,4,5) == checksum(10,5). * This property is what allows us to "cheat" and * checksum only the IP payload which has the TCP * th_sum field populated with the pseudo header's * checksum, and not need to futz around checksumming * pseudo header bytes and TCP header/data in one hit. * Refer to RFC 1071 for more info. * * NB: in_cksum_skip(struct mbuf *m, int len, int skip) * in_cksum_skip 2nd argument is NOT the number of * bytes to read from the mbuf at "skip" bytes offset * from the start of the mbuf (very counter intuitive!). * The number of bytes to read is calculated internally * by the function as len-skip i.e. to sum over the IP * payload (TCP header + data) bytes, it is INCORRECT * to call the function like this: * in_cksum_skip(at, ip->ip_len - offset, offset) * Rather, it should be called like this: * in_cksum_skip(at, ip->ip_len, offset) * which means read "ip->ip_len - offset" bytes from * the mbuf cluster "at" at offset "offset" bytes from * the beginning of the "at" mbuf's data pointer. */ th->th_sum = in_cksum_skip(*m, ntohs(ip->ip_len), ip_hl); } /* * XXX: Having to calculate the checksum in software and then * hash over all bytes is really inefficient. Would be nice to * find a way to create the hash and checksum in the same pass * over the bytes. 
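/*
 * Standalone demonstration of the RFC 1071 property relied on above: the
 * ones'-complement sum is associative, so folding a partial sum together
 * with the remaining words gives the same result as summing everything in
 * one pass. This is why a TCP segment whose th_sum already holds the
 * pseudo-header sum can simply be summed in full to obtain the final
 * checksum (illustrative only; ones_sum is not the kernel's in_cksum_skip).
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
ones_sum(const uint16_t *words, int n, uint32_t sum)
{
	while (n--) {
		sum += *words++;
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry */
	}
	return ((uint16_t)sum);
}

int
main(void)
{
	uint16_t data[] = { 0x1234, 0xabcd, 0x00ff, 0xff00, 0x4567 };
	uint16_t partial, folded, all_at_once;

	/* Sum the first four words, then fold the fifth into that result. */
	partial = ones_sum(data, 4, 0);
	folded = ones_sum(&data[4], 1, partial);

	/* Sum all five words in a single pass. */
	all_at_once = ones_sum(data, 5, 0);

	printf("folded=0x%04x one-pass=0x%04x %s\n", folded, all_at_once,
	    folded == all_at_once ? "(equal)" : "(mismatch)");
	return (0);
}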
*/ pn->hash = hash_pkt(*m, ip_hl); } mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret; inp_unlock: if (inp_locally_locked) INP_RUNLOCK(inp); ret: /* Returning 0 ensures pfil will not discard the pkt */ return (0); } #ifdef SIFTR_IPV6 static int siftr_chkpkt6(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { struct pkt_node *pn; struct ip6_hdr *ip6; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip6_hl; int inp_locally_locked; inp_locally_locked = 0; ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip6_{input|output} * already do the heavy lifting for us. */ ip6 = mtod(*m, struct ip6_hdr *); /* * Only continue processing if the packet is TCP * XXX: We should follow the next header fields * as shown on Pg 6 RFC 2460, but right now we'll * only check pkts that have no extension headers. */ if (ip6->ip6_nxt != IPPROTO_TCP) goto ret6; /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret6; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; ip6_hl = sizeof(struct ip6_hdr); /* * Create a tcphdr struct starting at the correct offset * in the ipv6 packet. ip->ip_hl gives the ip header length * in 4-byte words, so multiply it to get the size in bytes. */ th = (struct tcphdr *)((caddr_t)ip6 + ip6_hl); /* * For inbound packets, the pfil hooks don't provide a pointer to the * inpcb, so we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. */ inp = siftr_findinpcb(INP_IPV6, (struct ip *)ip6, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret6; else inp_locally_locked = 1; } /* Find the TCP control block that corresponds with this packet. */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionaly for a * packet sent during the shutdown phase of a TCP connection), * or we're in the timewait state, bail. */ if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock6; } pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock6; } siftr_siftdata(pn, inp, tp, INP_IPV6, dir, inp_locally_locked); /* XXX: Figure out how to generate hashes for IPv6 packets. */ mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret6; inp_unlock6: if (inp_locally_locked) INP_RUNLOCK(inp); ret6: /* Returning 0 ensures pfil will not discard the pkt. 
*/ return (0); } #endif /* #ifdef SIFTR_IPV6 */ static int siftr_pfil(int action) { struct pfil_head *pfh_inet; #ifdef SIFTR_IPV6 struct pfil_head *pfh_inet6; #endif VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); #ifdef SIFTR_IPV6 pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); #endif if (action == HOOK) { pfil_add_hook(siftr_chkpkt, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); #ifdef SIFTR_IPV6 pfil_add_hook(siftr_chkpkt6, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif } else if (action == UNHOOK) { pfil_remove_hook(siftr_chkpkt, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); #ifdef SIFTR_IPV6 pfil_remove_hook(siftr_chkpkt6, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS) { struct alq *new_alq; int error; error = sysctl_handle_string(oidp, arg1, arg2, req); /* Check for error or same filename */ if (error != 0 || req->newptr == NULL || strncmp(siftr_logfile, arg1, arg2) == 0) goto done; /* Filname changed */ error = alq_open(&new_alq, arg1, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); if (error != 0) goto done; /* * If disabled, siftr_alq == NULL so we simply close * the alq as we've proved it can be opened. * If enabled, close the existing alq and switch the old * for the new. */ if (siftr_alq == NULL) { alq_close(new_alq); } else { alq_close(siftr_alq); siftr_alq = new_alq; } /* Update filename upon success */ strlcpy(siftr_logfile, arg1, arg2); done: return (error); } static int siftr_manage_ops(uint8_t action) { struct siftr_stats totalss; struct timeval tval; struct flow_hash_node *counter, *tmp_counter; struct sbuf *s; int i, key_index, ret, error; uint32_t bytes_to_write, total_skipped_pkts; uint16_t lport, fport; uint8_t *key, ipver; #ifdef SIFTR_IPV6 uint32_t laddr[4]; uint32_t faddr[4]; #else uint8_t laddr[4]; uint8_t faddr[4]; #endif error = 0; total_skipped_pkts = 0; /* Init an autosizing sbuf that initially holds 200 chars. */ if ((s = sbuf_new(NULL, NULL, 200, SBUF_AUTOEXTEND)) == NULL) return (-1); if (action == SIFTR_ENABLE) { /* * Create our alq * XXX: We should abort if alq_open fails! */ alq_open(&siftr_alq, siftr_logfile, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); STAILQ_INIT(&pkt_queue); DPCPU_ZERO(ss); siftr_exit_pkt_manager_thread = 0; ret = kthread_add(&siftr_pkt_manager_thread, NULL, NULL, &siftr_pkt_manager_thr, RFNOWAIT, 0, "siftr_pkt_manager_thr"); siftr_pfil(HOOK); microtime(&tval); sbuf_printf(s, "enable_time_secs=%jd\tenable_time_usecs=%06ld\t" "siftrver=%s\thz=%u\ttcp_rtt_scale=%u\tsysname=%s\t" "sysver=%u\tipmode=%u\n", (intmax_t)tval.tv_sec, tval.tv_usec, MODVERSION_STR, hz, TCP_RTT_SCALE, SYS_NAME, __FreeBSD_version, SIFTR_IPMODE); sbuf_finish(s); alq_writen(siftr_alq, sbuf_data(s), sbuf_len(s), ALQ_WAITOK); } else if (action == SIFTR_DISABLE && siftr_pkt_manager_thr != NULL) { /* * Remove the pfil hook functions. All threads currently in * the hook functions are allowed to exit before siftr_pfil() * returns. */ siftr_pfil(UNHOOK); /* This will block until the pkt manager thread unlocks it. */ mtx_lock(&siftr_pkt_mgr_mtx); /* Tell the pkt manager thread that it should exit now. */ siftr_exit_pkt_manager_thread = 1; /* * Wake the pkt_manager thread so it realises that * siftr_exit_pkt_manager_thread == 1 and exits gracefully. 
* The wakeup won't be delivered until we unlock * siftr_pkt_mgr_mtx so this isn't racy. */ wakeup(&wait_for_pkt); /* Wait for the pkt_manager thread to exit. */ mtx_sleep(siftr_pkt_manager_thr, &siftr_pkt_mgr_mtx, PWAIT, "thrwait", 0); siftr_pkt_manager_thr = NULL; mtx_unlock(&siftr_pkt_mgr_mtx); totalss.n_in = DPCPU_VARSUM(ss, n_in); totalss.n_out = DPCPU_VARSUM(ss, n_out); totalss.nskip_in_malloc = DPCPU_VARSUM(ss, nskip_in_malloc); totalss.nskip_out_malloc = DPCPU_VARSUM(ss, nskip_out_malloc); totalss.nskip_in_mtx = DPCPU_VARSUM(ss, nskip_in_mtx); totalss.nskip_out_mtx = DPCPU_VARSUM(ss, nskip_out_mtx); totalss.nskip_in_tcpcb = DPCPU_VARSUM(ss, nskip_in_tcpcb); totalss.nskip_out_tcpcb = DPCPU_VARSUM(ss, nskip_out_tcpcb); totalss.nskip_in_inpcb = DPCPU_VARSUM(ss, nskip_in_inpcb); totalss.nskip_out_inpcb = DPCPU_VARSUM(ss, nskip_out_inpcb); total_skipped_pkts = totalss.nskip_in_malloc + totalss.nskip_out_malloc + totalss.nskip_in_mtx + totalss.nskip_out_mtx + totalss.nskip_in_tcpcb + totalss.nskip_out_tcpcb + totalss.nskip_in_inpcb + totalss.nskip_out_inpcb; microtime(&tval); sbuf_printf(s, "disable_time_secs=%jd\tdisable_time_usecs=%06ld\t" "num_inbound_tcp_pkts=%ju\tnum_outbound_tcp_pkts=%ju\t" "total_tcp_pkts=%ju\tnum_inbound_skipped_pkts_malloc=%u\t" "num_outbound_skipped_pkts_malloc=%u\t" "num_inbound_skipped_pkts_mtx=%u\t" "num_outbound_skipped_pkts_mtx=%u\t" "num_inbound_skipped_pkts_tcpcb=%u\t" "num_outbound_skipped_pkts_tcpcb=%u\t" "num_inbound_skipped_pkts_inpcb=%u\t" "num_outbound_skipped_pkts_inpcb=%u\t" "total_skipped_tcp_pkts=%u\tflow_list=", (intmax_t)tval.tv_sec, tval.tv_usec, (uintmax_t)totalss.n_in, (uintmax_t)totalss.n_out, (uintmax_t)(totalss.n_in + totalss.n_out), totalss.nskip_in_malloc, totalss.nskip_out_malloc, totalss.nskip_in_mtx, totalss.nskip_out_mtx, totalss.nskip_in_tcpcb, totalss.nskip_out_tcpcb, totalss.nskip_in_inpcb, totalss.nskip_out_inpcb, total_skipped_pkts); /* * Iterate over the flow hash, printing a summary of each * flow seen and freeing any malloc'd memory. * The hash consists of an array of LISTs (man 3 queue). 
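 * Each node's key is a packed byte string which the loop below decodes
 * in order:
 *
 *	key[0]		IP version (ipver)
 *	key[1]...	local address, local port, foreign address,
 *			foreign port, copied out via memcpy()
 *
 * The address width depends on whether SIFTR_IPV6 is defined (compare
 * the laddr/faddr declarations above), which is why sizeof(laddr) and
 * friends are used to advance key_index.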
*/ for (i = 0; i <= siftr_hashmask; i++) { LIST_FOREACH_SAFE(counter, counter_hash + i, nodes, tmp_counter) { key = counter->key; key_index = 1; ipver = key[0]; memcpy(laddr, key + key_index, sizeof(laddr)); key_index += sizeof(laddr); memcpy(&lport, key + key_index, sizeof(lport)); key_index += sizeof(lport); memcpy(faddr, key + key_index, sizeof(faddr)); key_index += sizeof(faddr); memcpy(&fport, key + key_index, sizeof(fport)); #ifdef SIFTR_IPV6 laddr[3] = ntohl(laddr[3]); faddr[3] = ntohl(faddr[3]); if (ipver == INP_IPV6) { laddr[0] = ntohl(laddr[0]); laddr[1] = ntohl(laddr[1]); laddr[2] = ntohl(laddr[2]); faddr[0] = ntohl(faddr[0]); faddr[1] = ntohl(faddr[1]); faddr[2] = ntohl(faddr[2]); sbuf_printf(s, "%x:%x:%x:%x:%x:%x:%x:%x;%u-" "%x:%x:%x:%x:%x:%x:%x:%x;%u,", UPPER_SHORT(laddr[0]), LOWER_SHORT(laddr[0]), UPPER_SHORT(laddr[1]), LOWER_SHORT(laddr[1]), UPPER_SHORT(laddr[2]), LOWER_SHORT(laddr[2]), UPPER_SHORT(laddr[3]), LOWER_SHORT(laddr[3]), ntohs(lport), UPPER_SHORT(faddr[0]), LOWER_SHORT(faddr[0]), UPPER_SHORT(faddr[1]), LOWER_SHORT(faddr[1]), UPPER_SHORT(faddr[2]), LOWER_SHORT(faddr[2]), UPPER_SHORT(faddr[3]), LOWER_SHORT(faddr[3]), ntohs(fport)); } else { laddr[0] = FIRST_OCTET(laddr[3]); laddr[1] = SECOND_OCTET(laddr[3]); laddr[2] = THIRD_OCTET(laddr[3]); laddr[3] = FOURTH_OCTET(laddr[3]); faddr[0] = FIRST_OCTET(faddr[3]); faddr[1] = SECOND_OCTET(faddr[3]); faddr[2] = THIRD_OCTET(faddr[3]); faddr[3] = FOURTH_OCTET(faddr[3]); #endif sbuf_printf(s, "%u.%u.%u.%u;%u-%u.%u.%u.%u;%u,", laddr[0], laddr[1], laddr[2], laddr[3], ntohs(lport), faddr[0], faddr[1], faddr[2], faddr[3], ntohs(fport)); #ifdef SIFTR_IPV6 } #endif free(counter, M_SIFTR_HASHNODE); } LIST_INIT(counter_hash + i); } sbuf_printf(s, "\n"); sbuf_finish(s); i = 0; do { bytes_to_write = min(SIFTR_ALQ_BUFLEN, sbuf_len(s)-i); alq_writen(siftr_alq, sbuf_data(s)+i, bytes_to_write, ALQ_WAITOK); i += bytes_to_write; } while (i < sbuf_len(s)); alq_close(siftr_alq); siftr_alq = NULL; } sbuf_delete(s); /* * XXX: Should be using ret to check if any functions fail * and set error appropriately */ return (error); } static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS) { if (req->newptr == NULL) goto skip; /* If the value passed in isn't 0 or 1, return an error. */ if (CAST_PTR_INT(req->newptr) != 0 && CAST_PTR_INT(req->newptr) != 1) return (1); /* If we are changing state (0 to 1 or 1 to 0). */ if (CAST_PTR_INT(req->newptr) != siftr_enabled ) if (siftr_manage_ops(CAST_PTR_INT(req->newptr))) { siftr_manage_ops(SIFTR_DISABLE); return (1); } skip: return (sysctl_handle_int(oidp, arg1, arg2, req)); } static void siftr_shutdown_handler(void *arg) { siftr_manage_ops(SIFTR_DISABLE); } /* * Module is being unloaded or machine is shutting down. Take care of cleanup. */ static int deinit_siftr(void) { /* Cleanup. */ siftr_manage_ops(SIFTR_DISABLE); hashdestroy(counter_hash, M_SIFTR, siftr_hashmask); mtx_destroy(&siftr_pkt_queue_mtx); mtx_destroy(&siftr_pkt_mgr_mtx); return (0); } /* * Module has just been loaded into the kernel. */ static int init_siftr(void) { EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL, SHUTDOWN_PRI_FIRST); /* Initialise our flow counter hash table. */ counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR, &siftr_hashmask); mtx_init(&siftr_pkt_queue_mtx, "siftr_pkt_queue_mtx", NULL, MTX_DEF); mtx_init(&siftr_pkt_mgr_mtx, "siftr_pkt_mgr_mtx", NULL, MTX_DEF); /* Print message to the user's current terminal. 
*/ uprintf("\nStatistical Information For TCP Research (SIFTR) %s\n" " http://caia.swin.edu.au/urp/newtcp\n\n", MODVERSION_STR); return (0); } /* * This is the function that is called to load and unload the module. * When the module is loaded, this function is called once with * "what" == MOD_LOAD * When the module is unloaded, this function is called twice with * "what" = MOD_QUIESCE first, followed by "what" = MOD_UNLOAD second * When the system is shut down e.g. CTRL-ALT-DEL or using the shutdown command, * this function is called once with "what" = MOD_SHUTDOWN * When the system is shut down, the handler isn't called until the very end * of the shutdown sequence i.e. after the disks have been synced. */ static int siftr_load_handler(module_t mod, int what, void *arg) { int ret; switch (what) { case MOD_LOAD: ret = init_siftr(); break; case MOD_QUIESCE: case MOD_SHUTDOWN: ret = deinit_siftr(); break; case MOD_UNLOAD: ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t siftr_mod = { .name = "siftr", .evhand = siftr_load_handler, }; /* * Param 1: name of the kernel module * Param 2: moduledata_t struct containing info about the kernel module * and the execution entry point for the module * Param 3: From sysinit_sub_id enumeration in /usr/include/sys/kernel.h * Defines the module initialisation order * Param 4: From sysinit_elem_order enumeration in /usr/include/sys/kernel.h * Defines the initialisation order of this kld relative to others * within the same subsystem as defined by param 3 */ -DECLARE_MODULE(siftr, siftr_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(siftr, siftr_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(siftr, alq, 1, 1, 1); MODULE_VERSION(siftr, MODVERSION); Index: head/sys/ofed/drivers/infiniband/core/device.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/device.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/core/device.c (revision 296688) @@ -1,793 +1,793 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct ib_client_data { struct list_head list; struct ib_client *client; void * data; }; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* * device_mutex protects access to both device_list and client_list. * There's no real point to using multiple locks or something fancier * like an rwsem: we always access both lists, and we're always * modifying one list or the other list. In any case this is not a * hot path so there's no point in trying to optimize. */ static DEFINE_MUTEX(device_mutex); static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(dereg_mr) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } static int start_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; } static int end_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : device->phys_port_cnt; } /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). 
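 *
 * A typical call from a hardware driver looks roughly like the sketch
 * below; "struct my_hw_dev" and its "ib_dev" member are hypothetical
 * stand-ins for a driver-private structure that embeds struct ib_device
 * as its first member:
 *
 *	struct my_hw_dev *hwdev;
 *
 *	hwdev = (struct my_hw_dev *)ib_alloc_device(sizeof(*hwdev));
 *	if (!hwdev)
 *		return -ENOMEM;
 *	...
 *	ib_dealloc_device(&hwdev->ib_dev);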
*/ struct ib_device *ib_alloc_device(size_t size) { struct ib_device *dev; BUG_ON(size < sizeof (struct ib_device)); dev = kzalloc(size, GFP_KERNEL); spin_lock_init(&dev->cmd_perf_lock); return dev; } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->reg_state == IB_DEV_UNINITIALIZED) { kfree(device); return; } BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); return 0; } static int read_port_table_lengths(struct ib_device *device) { struct ib_port_attr *tprops = NULL; int num_ports, ret = -ENOMEM; u8 port_index; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) goto out; num_ports = end_port(device) - start_port(device) + 1; device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, GFP_KERNEL); device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, GFP_KERNEL); if (!device->pkey_tbl_len || !device->gid_tbl_len) goto err; for (port_index = 0; port_index < num_ports; ++port_index) { ret = ib_query_port(device, port_index + start_port(device), tprops); if (ret) goto err; device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; device->gid_tbl_len[port_index] = tprops->gid_tbl_len; } ret = 0; goto out; err: kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); out: kfree(tprops); return ret; } /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
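 *
 * Illustrative only: a driver that does not need per-port sysfs entries
 * of its own can pass NULL for the port_callback argument:
 *
 *	ret = ib_register_device(device, NULL);
 *	if (ret)
 *		ib_dealloc_device(device);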
*/ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } INIT_LIST_HEAD(&device->event_handler_list); INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); ret = read_port_table_lengths(device); if (ret) { printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", device->name); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); goto out; } list_add_tail(&device->core_list, &device_list); device->reg_state = IB_DEV_REGISTERED; { struct ib_client *client; list_for_each_entry(client, &client_list, list) if (client->add && !add_client_context(device, client)) client->add(device); } out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. */ void ib_unregister_device(struct ib_device *device) { struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry_reverse(client, &client_list, list) if (client->remove) client->remove(device); list_del(&device->core_list); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. 
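 *
 * A minimal client might be registered as follows (hypothetical names,
 * for illustration only):
 *
 *	static void my_add_one(struct ib_device *device);
 *	static void my_remove_one(struct ib_device *device);
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one
 *	};
 *
 *	ib_register_client(&my_client);
 *	...
 *	ib_unregister_client(&my_client);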
*/ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) { if (client->remove) client->remove(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { list_del(&context->list); kfree(context); } spin_unlock_irqrestore(&device->client_data_lock, flags); } list_del(&client->list); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } printk(KERN_WARNING "No client context found for %s/%s\n", device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. 
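 *
 * For example (illustrative; field names follow struct ib_event), a
 * driver reporting that a port came up would do something like:
 *
 *	struct ib_event event;
 *
 *	event.device = ibdev;
 *	event.event = IB_EVENT_PORT_ACTIVE;
 *	event.element.port_num = port;
 *	ib_dispatch_event(&event);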
*/ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_device - Query IB device attributes * @device:Device to query * @device_attr:Device attributes * * ib_query_device() returns the attributes of a device through the * @device_attr pointer. */ int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr) { return device->query_device(device, device_attr); } EXPORT_SYMBOL(ib_query_device); /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid) { return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->modify_device) return -ENOSYS; return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { if (!device->modify_port) return -ENOSYS; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. 
* @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = start_port(device); port <= end_port(device); ++port) { for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); static int __init ib_core_init(void) { int ret; ib_wq = create_workqueue("infiniband"); if (!ib_wq) return -ENOMEM; ret = ib_sysfs_setup(); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); goto err; } ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); goto err_sysfs; } return 0; err_sysfs: ib_sysfs_cleanup(); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } module_init(ib_core_init); module_exit(ib_core_cleanup); static int ibcore_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ibcore_mod = { .name = "ibcore", .evhand = ibcore_evhand, }; MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1); -DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_LAST, SI_ORDER_ANY); Index: head/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296688) @@ -1,2887 +1,2887 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "mlx4_exp.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE __DATE__ #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __linux__ MODULE_VERSION(DRV_VERSION); #endif int mlx4_ib_sm_guid_assign = 1; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); enum { MAX_NUM_STR_BITMAP = 1 << 15, DEFAULT_TBL_VAL = -1 }; static struct mlx4_dbdf2val_lst dev_assign_str = { .name = "dev_assign_str param", .num_vals = 1, .def_val = {DEFAULT_TBL_VAL}, .range = {0, MAX_NUM_STR_BITMAP - 1} }; module_param_string(dev_assign_str, dev_assign_str.str, sizeof(dev_assign_str.str), 0444); MODULE_PARM_DESC(dev_assign_str, "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 
1).\n" "\t\tMax supported devices - 32"); static unsigned long *dev_num_str_bitmap; static spinlock_t dev_num_str_lock; static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; static int dr_active; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, unsigned long); static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int check_flow_steering_support(struct mlx4_dev *dev) { int eth_num_ports = 0; int ib_num_ports = 0; int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; if (dmfs) { int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) eth_num_ports++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; dmfs &= (!ib_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && (!eth_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); if (ib_num_ports && mlx4_is_mfunc(dev)) { dmfs = 0; } } return dmfs; } int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (check_flow_steering_support(dev->dev)) props->device_cap_flags |= 
IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock; if (dev->dev->caps.hca_core_clock > 0) props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; if (dev->dev->caps.cq_timestamp) { props->timestamp_mask = 0xFFFFFFFFFFFF; props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; unsigned long flags; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? 
IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock_irqsave(&iboe->lock, flags); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_irqrestore(&iboe->lock, flags); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; 
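	/*
	 * Each P_Key table MAD carries a block of 32 entries, so attr_mod
	 * selects the block containing the requested index and the matching
	 * entry is picked out of the response below with (index % 32).
	 */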
in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = 
dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } /* XXX FBSD has no support for get_unmapped_area function */ #if 0 static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* Last 8 bits hold the command others are data per that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes. 
*/ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatability handling for commands 0 & 1*/ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (ret) return ret; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, (pci_resource_start(dev->dev->pdev, params.bar) + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else return -EINVAL; return 0; } static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd, unsigned long arg) { struct mlx4_ib_dev *dev = to_mdev(context->device); int ret; int offset; switch (cmd) { case MLX4_IOCHWCLOCKOFFSET: { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (!ret) { offset = params.offset % PAGE_SIZE; ret = put_user(offset, (int *)arg); return sizeof(int); } else { return ret; } } default: { pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n", cmd, arg); return -ENOTTY; } } return ret; } static int mlx4_ib_query_values(struct ib_device *device, int q_values, struct ib_device_values *values) { struct mlx4_ib_dev *dev = to_mdev(device); cycle_t cycles; values->values_mask = 0; if (q_values & IBV_VALUES_HW_CLOCK) { cycles = mlx4_read_clock(dev->dev); if (cycles < 0) { values->hwclock = cycles & CORE_CLOCK_MASK; values->values_mask |= IBV_VALUES_HW_CLOCK; } q_values &= ~IBV_VALUES_HW_CLOCK; } if (q_values) return -ENOTTY; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = 
mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int parse_flow_attr(struct mlx4_dev *dev, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { enum mlx4_net_trans_rule_id type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: type = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, ETH_ALEN); memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, ETH_ALEN); mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; case IB_FLOW_SPEC_IB: type = MLX4_NET_TRANS_RULE_ID_IB; mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn; mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn; memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16); memcpy(&mlx4_spec->ib.dst_gid_msk, ib_spec->ib.mask.dst_gid, 16); break; case IB_FLOW_SPEC_IPV4: type = MLX4_NET_TRANS_RULE_ID_IPV4; mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: type = ib_spec->type == IB_FLOW_SPEC_TCP ? 
MLX4_NET_TRANS_RULE_ID_TCP : MLX4_NET_TRANS_RULE_ID_UDP; mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; break; default: return -EINVAL; } if (map_sw_to_hw_steering_id(dev, type) < 0 || hw_rule_sz(dev, type) < 0) return -EINVAL; mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); mlx4_spec->size = hw_rule_sz(dev, type) >> 2; return hw_rule_sz(dev, type); } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, u64 *reg_id) { int ret, i; int size = 0; void *ib_flow; struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + (sizeof(struct _rule_hw) * flow_attr->num_of_specs); static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, }; if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { pr_err("Invalid priority value.\n"); return -EINVAL; } if (domain >= IB_FLOW_DOMAIN_NUM) { pr_err("Invalid domain value.\n"); return -EINVAL; } if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK) ctrl->flags = (1 << 3); ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); for (i = 0; i < flow_attr->num_of_specs; i++) { ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } ib_flow += ((union ib_flow_spec *)ib_flow)->size; size += ret; } ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret == -ENOMEM) pr_err("mcg table is full. Fail to register network rule.\n"); else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) pr_err("Invalid argumant. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { int err; err = mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) pr_err("Fail to detach network rule. 
registration id = 0x%llx\n", (unsigned long long)reg_id); return err; } static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { int err = 0, i = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL); if (!mflow) { err = -ENOMEM; goto err_free; } switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: type[0] = MLX4_FS_REGULAR; break; case IB_FLOW_ATTR_ALL_DEFAULT: type[0] = MLX4_FS_ALL_DEFAULT; break; case IB_FLOW_ATTR_MC_DEFAULT: type[0] = MLX4_FS_MC_DEFAULT; break; case IB_FLOW_ATTR_SNIFFER: type[0] = MLX4_FS_UC_SNIFFER; type[1] = MLX4_FS_MC_SNIFFER; break; default: err = -EINVAL; goto err_free; } while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], &mflow->reg_id[i]); if (err) goto err_free; i++; } return &mflow->ibflow; err_free: kfree(mflow); return ERR_PTR(err); } static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) { int err, ret = 0; int i = 0; struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); if (err) ret = err; i++; } kfree(mflow); return ret; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_gid_entry *ge; struct net_device *ndev; u8 mac[6]; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return ge != 0 ? 0 : -EINVAL; } static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid, int count) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id = 0; int record_err = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; struct mlx4_ib_steering *tmp; LIST_HEAD(temp); mutex_lock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules, list) { if (memcmp(ib_steering->gid.raw, gid->raw, 16)) continue; if (--count < 0) break; list_del(&ib_steering->list); list_add(&ib_steering->list, &temp); } mutex_unlock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &temp, list) { reg_id = ib_steering->reg_id; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) { record_err = record_err ?: err; continue; } err = del_gid_entry(ibqp, gid); if (err) { record_err = record_err ?: err; continue; } list_del(&ib_steering->list); kfree(ib_steering); } mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &temp, list) { list_add(&ib_steering->list, &mqp->steering_rules); } mutex_unlock(&mqp->mutex); if (count) { pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n"); return -EINVAL; } } else { if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) return err; err = del_gid_entry(ibqp, gid); if (err) return err; } return record_err; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); int count = (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) ? mdev->dev->caps.num_ports : 1; return _mlx4_ib_mcg_detach(ibqp, gid, lid, count); } static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err = -ENODEV; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); DECLARE_BITMAP(ports, MLX4_MAX_PORTS); int i = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { bitmap_fill(ports, mdev->dev->caps.num_ports); } else { if (mqp->port <= mdev->dev->caps.num_ports) { bitmap_zero(ports, mdev->dev->caps.num_ports); set_bit(0, ports); } else { return -EINVAL; } } for (; i < mdev->dev->caps.num_ports; i++) { u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (!test_bit(i, ports)) continue; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) goto err_add; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, i + 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, &reg_id); if (err) { kfree(ib_steering); goto err_add; } err = add_gid_entry(ibqp, gid); if (err) { mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); kfree(ib_steering); goto err_add; } if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); ib_steering->reg_id = reg_id; } } return 0; err_add: if (i > 0) _mlx4_ib_mcg_detach(ibqp, gid, lid, i); return err; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static ssize_t show_vsd(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ssize_t len = MLX4_VSD_LEN; if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); else memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); return len; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_vsd }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) { memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void
update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == IB_LINK_LAYER_ETHERNET) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static void reset_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("reset gid table failed\n"); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof(gw->gids)); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 0) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 1, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 1 command failed\n"); } if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 1) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 2 command failed\n"); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static int update_gid_table(struct mlx4_ib_dev *dev, int port, union ib_gid *gid, int clear, int default_gid) { struct update_gid_work *work; int i; int need_update = 0; int free = -1; int found = -1; int max_gids; int start_index = !default_gid; max_gids = dev->dev->caps.gid_table_len[port]; for (i = start_index; i < max_gids; ++i) { if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, sizeof(*gid))) found = i; if (clear) { if (found >= 0) { need_update = 1; dev->iboe.gid_table[port - 1][found] = zgid; break; } } else { if (found >= 0) break; if (free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof(*gid))) free = i; } } if (found == -1 && !clear && free < 0) { pr_err("GID table of port %d is full. 
Can't add "GID_PRINT_FMT"\n", port, GID_PRINT_ARGS(gid)); return -ENOMEM; } if (found == -1 && clear) { pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port); return -EINVAL; } if (found == -1 && !clear && free >= 0) { dev->iboe.gid_table[port - 1][free] = *gid; need_update = 1; } if (!need_update) return 0; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); return 0; } static int reset_gid_table(struct mlx4_ib_dev *dev) { struct update_gid_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return -ENOMEM; memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table)); memset(work->gids, 0, sizeof(work->gids)); INIT_WORK(&work->work, reset_gids_task); work->dev = dev; queue_work(wq, &work->work); return 0; } /* XXX BOND Related - stub (no support for these flags in FBSD)*/ static inline int netif_is_bond_master(struct net_device *dev) { #if 0 return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING); #endif return 0; } static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port); } static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev) { u8 port = 0; struct mlx4_ib_iboe *iboe; struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? rdma_vlan_dev_real_dev(dev) : dev; iboe = &ibdev->iboe; for (port = 1; port <= MLX4_MAX_PORTS; ++port) if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) || (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1]))) break; return port > MLX4_MAX_PORTS ? 
0 : port; } static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port) { struct ifaddr *ifa; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct inet6_dev *in6_dev; union ib_gid *pgid; struct inet6_ifaddr *ifp; #endif union ib_gid gid; if ((port == 0) || (port > MLX4_MAX_PORTS)) return; /* IPv4 gids */ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){ ipv6_addr_set_v4mapped( ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, (struct in6_addr *)&gid); update_gid_table(ibdev, port, &gid, 0, 0); } } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* IPv6 gids */ in6_dev = in6_dev_get(dev); if (in6_dev) { read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { pgid = (union ib_gid *)&ifp->addr; update_gid_table(ibdev, port, pgid, 0, 0); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); } #endif } static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev, struct net_device *dev, u8 port) { union ib_gid gid; mlx4_make_default_gid(dev, &gid, port); update_gid_table(ibdev, port, &gid, 0, 1); } static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) { struct net_device *dev; if (reset_gid_table(ibdev)) return -1; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(dev, &V_ifnet, if_link) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); if (port) { if (!rdma_vlan_dev_real_dev(dev) && !netif_is_bond_master(dev)) mlx4_set_default_gid(ibdev, dev, port); mlx4_ib_get_dev_addr(dev, ibdev, port); } } IFNET_RUNLOCK_NOSLEEP(); return 0; } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device *dev, unsigned long event) { struct mlx4_ib_iboe *iboe; int port; int init = 0; unsigned long flags; iboe = &ibdev->iboe; spin_lock_irqsave(&iboe->lock, flags); mlx4_foreach_ib_transport_port(port, ibdev->dev) { struct net_device *old_netdev = iboe->netdevs[port - 1]; /* XXX BOND related */ #if 0 struct net_device *old_master = iboe->masters[port - 1]; #endif iboe->masters[port - 1] = NULL; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (old_netdev != iboe->netdevs[port - 1]) init = 1; if (dev == iboe->netdevs[port - 1] && event == NETDEV_CHANGEADDR) init = 1; /* XXX BOND related */ #if 0 if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1])) iboe->masters[port - 1] = iboe->netdevs[port - 1]->master; /* if bonding is used it is possible that we add it to masters only after IP address is assigned to the net bonding interface */ if (old_master != iboe->masters[port - 1]) init = 1; #endif } spin_unlock_irqrestore(&iboe->lock, flags); if (init) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } /* This function initializes the gid table only if the event_netdev real device is an iboe * device, will be invoked by the inet/inet6 events */ static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_netdev = ptr; struct mlx4_ib_dev *ibdev; struct mlx4_ib_iboe *ibdev_iboe; int port = 0; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? 
rdma_vlan_dev_real_dev(event_netdev) : event_netdev; ibdev_iboe = &ibdev->iboe; port = mlx4_ib_get_dev_port(real_dev, ibdev); /* Initialize the GID table only if the event's real_dev is the net_device * that represents this port; otherwise the event is unrelated and ignored. */ if (port && (real_dev == ibdev_iboe->netdevs[port - 1])) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the rest of the vector with legacy EQs */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf){ \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name,
S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; static void init_dev_assign(void) { int i = 1; spin_lock_init(&dev_num_str_lock); if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) return; dev_num_str_bitmap = kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), GFP_KERNEL); if (!dev_num_str_bitmap) { pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); return; } bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); while ((i < MLX4_DEVS_TBL_SIZE) && 
(dev_assign_str.tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (bitmap_allocate_region(dev_num_str_bitmap, dev_assign_str.tbl[i].val[0], 0)) goto err; i++; } dr_active = 1; return; err: kfree(dev_num_str_bitmap); dev_num_str_bitmap = NULL; pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. The parameter value is discarded!"); } static int mlx4_ib_dev_idx(struct mlx4_dev *dev) { int i, val; if (!dr_active) return -1; if (!dev) return -1; if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) return -1; if (val != DEFAULT_TBL_VAL) { dev->flags |= MLX4_FLAG_DEV_NUM_STR; return val; } spin_lock(&dev_num_str_lock); i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); spin_unlock(&dev_num_str_lock); if (i >= 0) return i; return -1; } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; int dev_idx; pr_info_once("%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; dev_idx = mlx4_ib_dev_idx(dev); if (dev_idx >= 0) sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); else strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; 
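/* The remaining ib_device verbs below (ucontext, PD, AH, SRQ, QP, CQ, MR, multicast attach/detach and MAD processing) are likewise wired to their mlx4 implementations. */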
ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; /* XXX FBSD has no support for get_unmapped_area function */ #if 0 ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.ioctl = mlx4_ib_ioctl; ibdev->ib_dev.query_values = mlx4_ib_query_values; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } /* * Set experimental data */ ibdev->ib_dev.uverbs_exp_cmd_mask = (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) | (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ); ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp; ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device; if (check_flow_steering_support(dev)) { ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; } else { pr_debug("Device managed flow steering is unavailable for this configuration.\n"); } /* * End of experimental data */ mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { if 
(mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { if (mlx4_is_slave(dev)) { ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i].counter_index); } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; ibdev->counters[i].status = 0; } dev_info(&dev->pdev->dev, "%s: allocated counter index %d for port %d\n", __func__, ibdev->counters[i].counter_index, i+1); } else { ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; ibdev->counters[i].status = -ENOSPC; } } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_mfunc(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) { iboe->nb.notifier_call = NULL; goto err_notify; } } if (!iboe->nb_inet.notifier_call) { iboe->nb_inet.notifier_call = mlx4_ib_inet_event; err = register_inetaddr_notifier(&iboe->nb_inet); if (err) { iboe->nb_inet.notifier_call = NULL; goto err_notify; } } mlx4_ib_scan_netdevs(ibdev, NULL, 0); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notify; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notify; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notify: for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) { if 
(mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1].counter_index); } } err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { int err; size_t flow_size; struct ib_flow_attr *flow = NULL; struct ib_flow_spec_ib *ib_spec; if (is_attach) { flow_size = sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_ib); flow = kzalloc(flow_size, GFP_KERNEL); if (!flow) return -ENOMEM; flow->port = mqp->port; flow->num_of_specs = 1; flow->size = flow_size; ib_spec = (struct ib_flow_spec_ib *)(flow + 1); ib_spec->type = IB_FLOW_SPEC_IB; ib_spec->size = sizeof(struct ib_flow_spec_ib); ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num; ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK; err = __mlx4_ib_create_flow(&mqp->ibqp, flow, IB_FLOW_DOMAIN_NIC, MLX4_FS_REGULAR, &mqp->reg_id); } else { err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); } kfree(flow); return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p, j; int dev_idx, ret; if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } dev_idx = -1; if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); if (ret != 1) dev_idx = -1; } ib_unregister_device(&ibdev->ib_dev); if (dev_idx >= 0) { spin_lock(&dev_num_str_lock); bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); spin_unlock(&dev_num_str_lock); } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p].counter_index); } } mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, 
int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, 
SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); static int mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; -DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 296688) @@ -1,1540 +1,1540 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) /* * This is for clients that have an ipoib_header in the mbuf. 
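 * The IPoIB header is temporarily stripped and a synthetic Ethernet header is passed to bpf_mtap2(), so BPF consumers see an Ethernet-style frame.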
*/ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { struct ifnet *dev = priv->dev; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->if_mtu = new_mtu; return 0; } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; 
case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, INFINIBAND_ALEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, 
&remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { _IF_ENQUEUE(&path->queue, mb); if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: 
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

static void
ipoib_detach(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev;

	dev = priv->dev;
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		bpfdetach(dev);
		if_detach(dev);
		if_free(dev);
	} else
		VLAN_SETCOOKIE(priv->dev, NULL);

	free(priv, M_TEMP);
}

void
ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
{
	struct ipoib_dev_priv *cpriv, *tcpriv;

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		ipoib_dev_cleanup(cpriv);
		ipoib_detach(cpriv);
	}

	ipoib_ib_dev_cleanup(priv);

	kfree(priv->rx_ring);
	kfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

static volatile int ipoib_unit;

static struct ipoib_dev_priv *
ipoib_priv_alloc(void)
{
	struct ipoib_dev_priv *priv;

	priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
	spin_lock_init(&priv->lock);
	spin_lock_init(&priv->drain_lock);
	mutex_init(&priv->vlan_mutex);
	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);
	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);

	return (priv);
}

struct ipoib_dev_priv *
ipoib_intf_alloc(const char *name)
{
	struct ipoib_dev_priv *priv;
	struct sockaddr_dl *sdl;
	struct ifnet *dev;

	priv = ipoib_priv_alloc();
	dev = priv->dev = if_alloc(IFT_INFINIBAND);
	if (!dev) {
		free(priv, M_TEMP);
		return NULL;
	}
	dev->if_softc = priv;
	if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1));
	dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
	dev->if_addrlen = INFINIBAND_ALEN;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if_attach(dev);
	dev->if_init = ipoib_init;
	dev->if_ioctl = ipoib_ioctl;
	dev->if_start = ipoib_start;
	dev->if_output = ipoib_output;
	dev->if_input = ipoib_input;
	dev->if_resolvemulti = ipoib_resolvemulti;
	dev->if_baudrate = IF_Gbps(10);
	dev->if_broadcastaddr = priv->broadcastaddr;
	dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
	sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
	sdl->sdl_type = IFT_INFINIBAND;
	sdl->sdl_alen = dev->if_addrlen;
	priv->dev = dev;
	if_link_state_change(dev, LINK_STATE_DOWN);
	bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);

	return dev->if_softc;
}

int
ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		       hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		       hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	priv->dev->if_hwassist = 0;
	priv->dev->if_capabilities = 0;

#ifndef CONFIG_INFINIBAND_IPOIB_CM
	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
		priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
	}
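	/*
	 * The block below would additionally advertise TSO4/CSUM_TSO when
	 * the HCA reports IB_DEVICE_UD_TSO, but it is compiled out here.
	 */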
#if 0
	if (priv->dev->features & NETIF_F_SG &&
	    priv->hca_caps & IB_DEVICE_UD_TSO) {
		priv->dev->if_capabilities |= IFCAP_TSO4;
		priv->dev->if_hwassist |= CSUM_TSO;
	}
#endif
#endif

	priv->dev->if_capabilities |=
	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
	priv->dev->if_capenable = priv->dev->if_capabilities;

	return 0;
}

static struct ifnet *
ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	if (!ib_query_port(hca, port, &attr))
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	if (ipoib_set_dev_features(priv, hca))
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->broadcastaddr[8] = priv->pkey >> 8;
	priv->broadcastaddr[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw,
	    sizeof (union ib_gid));

	result = ipoib_dev_init(priv, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	if (ipoib_cm_admin_enabled(priv))
		priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));

	INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}
	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);

	return priv->dev;

event_failed:
	ipoib_dev_cleanup(priv);

device_init_failed:
	ipoib_detach(priv);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void
ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct ifnet *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
			continue;
		dev = ipoib_add_port("ib", device, p);
		if (!IS_ERR(dev)) {
			priv = dev->if_softc;
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void
ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
			continue;

		ipoib_stop(priv);

		ib_unregister_event_handler(&priv->event_handler);
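		/*
		 * Quiesce the interface and stop event delivery before the
		 * workqueue flush below, so no deferred work runs against a
		 * device that is about to be cleaned up and detached.
		 */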
		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */

		flush_workqueue(ipoib_workqueue);

		ipoib_dev_cleanup(priv);
		ipoib_detach(priv);
	}

	kfree(dev_list);
}

static void
ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;
	int error;

	if (ifp->if_type != IFT_INFINIBAND)
		return;
	dev = VLAN_DEVAT(ifp, vtag);
	if (dev == NULL)
		return;
	priv = NULL;
	error = 0;
	parent = ifp->if_softc;
	/* We only support 15 bits of pkey. */
	if (vtag & 0x8000)
		return;
	pkey = vtag | 0x8000;	/* Set full membership bit. */
	if (pkey == parent->pkey)
		return;
	/* Check for dups */
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			priv = NULL;
			error = EBUSY;
			goto out;
		}
	}
	priv = ipoib_priv_alloc();
	priv->dev = dev;
	priv->max_ib_mtu = parent->max_ib_mtu;
	priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
	error = ipoib_set_dev_features(priv, parent->ca);
	if (error)
		goto out;
	priv->pkey = pkey;
	priv->broadcastaddr[8] = pkey >> 8;
	priv->broadcastaddr[9] = pkey & 0xff;
	dev->if_broadcastaddr = priv->broadcastaddr;
	error = ipoib_dev_init(priv, parent->ca, parent->port);
	if (error)
		goto out;
	priv->parent = parent->dev;
	list_add_tail(&priv->list, &parent->child_intfs);
	VLAN_SETCOOKIE(dev, priv);
	dev->if_start = ipoib_vlan_start;
	dev->if_drv_flags &= ~IFF_DRV_RUNNING;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		ipoib_open(priv);
	mutex_unlock(&parent->vlan_mutex);
	return;
out:
	mutex_unlock(&parent->vlan_mutex);
	if (priv)
		free(priv, M_TEMP);
	if (error)
		ipoib_warn(parent,
		    "failed to initialize subinterface: device %s, port %d vtag 0x%X",
		    parent->ca->name, parent->port, vtag);
	return;
}

static void
ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;

	if (ifp->if_type != IFT_INFINIBAND)
		return;

	dev = VLAN_DEVAT(ifp, vtag);
	if (dev)
		VLAN_SETCOOKIE(dev, NULL);
	pkey = vtag | 0x8000;
	parent = ifp->if_softc;
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			ipoib_dev_cleanup(priv);
			list_del(&priv->list);
			break;
		}
	}
	mutex_unlock(&parent->vlan_mutex);
}

eventhandler_tag ipoib_vlan_attach;
eventhandler_tag ipoib_vlan_detach;

static int __init
ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
	    IPOIB_MIN_QUEUE_SIZE));
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
	    ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
	    ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed. We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
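	/*
	 * The deferred IPoIB work items initialized in ipoib_priv_alloc()
	 * are queued to this private workqueue, which ipoib_remove_one()
	 * flushes before tearing a device down.
	 */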
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	return ret;
}

static void __exit
ipoib_cleanup_module(void)
{
	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);
}

/*
 * Infiniband output routine.
 */
static int
ipoib_output(struct ifnet *ifp, struct mbuf *m,
	const struct sockaddr *dst, struct route *ro)
{
	u_char edst[INFINIBAND_ALEN];
#if defined(INET) || defined(INET6)
	struct llentry *lle = NULL;
#endif
	struct ipoib_header *eh;
	int error = 0, is_gw = 0;
	short type;

	if (ro != NULL)
		is_gw = (ro->ro_flags & RT_HAS_GW) != 0;
#ifdef MAC
	error = mac_ifnet_check_transmit(ifp, m);
	if (error)
		goto bad;
#endif

	M_PROFILE(m);
	if (ifp->if_flags & IFF_MONITOR) {
		error = ENETDOWN;
		goto bad;
	}
	if (!((ifp->if_flags & IFF_UP) &&
	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
		error = ENETDOWN;
		goto bad;
	}

	switch (dst->sa_family) {
#ifdef INET
	case AF_INET:
		if (lle != NULL && (lle->la_flags & LLE_VALID))
			memcpy(edst, lle->ll_addr, sizeof(edst));
		else if (m->m_flags & M_MCAST)
			ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr,
			    ifp->if_broadcastaddr, edst);
		else
			error = arpresolve(ifp, is_gw, m, dst, edst, NULL);
		if (error)
			return (error == EWOULDBLOCK ? 0 : error);
		type = htons(ETHERTYPE_IP);
		break;
	case AF_ARP:
	    {
		struct arphdr *ah;
		ah = mtod(m, struct arphdr *);
		ah->ar_hrd = htons(ARPHRD_INFINIBAND);

		switch(ntohs(ah->ar_op)) {
		case ARPOP_REVREQUEST:
		case ARPOP_REVREPLY:
			type = htons(ETHERTYPE_REVARP);
			break;
		case ARPOP_REQUEST:
		case ARPOP_REPLY:
		default:
			type = htons(ETHERTYPE_ARP);
			break;
		}

		if (m->m_flags & M_BCAST)
			bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
		else
			bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);

	    }
	    break;
#endif
#ifdef INET6
	case AF_INET6:
		if (lle != NULL && (lle->la_flags & LLE_VALID))
			memcpy(edst, lle->ll_addr, sizeof(edst));
		else if (m->m_flags & M_MCAST)
			ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr,
			    ifp->if_broadcastaddr, edst);
		else
			error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL);
		if (error)
			return error;
		type = htons(ETHERTYPE_IPV6);
		break;
#endif

	default:
		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
		error = EAFNOSUPPORT;
		goto bad;
	}

	/*
	 * Add local net header. If no space in first mbuf,
	 * allocate another.
	 */
	M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT);
	if (m == NULL) {
		error = ENOBUFS;
		goto bad;
	}
	eh = mtod(m, struct ipoib_header *);
	(void)memcpy(&eh->proto, &type, sizeof(eh->proto));
	(void)memcpy(&eh->hwaddr, edst, sizeof (edst));

	/*
	 * Queue message on interface, update output statistics if
	 * successful, and start output if interface not yet active.
	 */
	return ((ifp->if_transmit)(ifp, m));
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Upper layer processing for a received Infiniband packet.
 */
void
ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
{
	int isr;

#ifdef MAC
	/*
	 * Tag the mbuf with an appropriate MAC label before any other
	 * consumers can get to it.
	 */
	mac_ifnet_create_mbuf(ifp, m);
#endif

	/* Allow monitor mode to claim this frame, after stats are updated. */
	if (ifp->if_flags & IFF_MONITOR) {
		if_printf(ifp, "discard frame at IFF_MONITOR\n");
		m_freem(m);
		return;
	}

	/*
	 * Dispatch frame to upper layer.
	 */
	switch (proto) {
#ifdef INET
	case ETHERTYPE_IP:
		isr = NETISR_IP;
		break;

	case ETHERTYPE_ARP:
		if (ifp->if_flags & IFF_NOARP) {
			/* Discard packet if ARP is disabled on interface */
			m_freem(m);
			return;
		}
		isr = NETISR_ARP;
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		isr = NETISR_IPV6;
		break;
#endif
	default:
		goto discard;
	}
	netisr_dispatch(isr, m);
	return;

discard:
	m_freem(m);
}

/*
 * Process a received Infiniband packet.
 */
static void
ipoib_input(struct ifnet *ifp, struct mbuf *m)
{
	struct ipoib_header *eh;

	if ((ifp->if_flags & IFF_UP) == 0) {
		m_freem(m);
		return;
	}
	CURVNET_SET_QUIET(ifp->if_vnet);

	/* Let BPF have it before we strip the header. */
	IPOIB_MTAP(ifp, m);

	eh = mtod(m, struct ipoib_header *);

	/*
	 * Reset layer specific mbuf flags to avoid confusing upper layers.
	 * Strip off Infiniband header.
	 */
	m->m_flags &= ~M_VLANTAG;
	m_clrprotoflags(m);
	m_adj(m, IPOIB_HEADER_LEN);

	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
		if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
		    ifp->if_addrlen) == 0)
			m->m_flags |= M_BCAST;
		else
			m->m_flags |= M_MCAST;
		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
	}

	ipoib_demux(ifp, m, ntohs(eh->proto));
	CURVNET_RESTORE();
}

static int
ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
	struct sockaddr *sa)
{
	struct sockaddr_dl *sdl;
#ifdef INET
	struct sockaddr_in *sin;
#endif
#ifdef INET6
	struct sockaddr_in6 *sin6;
#endif
	u_char *e_addr;

	switch(sa->sa_family) {
	case AF_LINK:
		/*
		 * No mapping needed. Just check that it's a valid MC address.
		 */
		sdl = (struct sockaddr_dl *)sa;
		e_addr = LLADDR(sdl);
		if (!IPOIB_IS_MULTICAST(e_addr))
			return EADDRNOTAVAIL;
		*llsa = 0;
		return 0;

#ifdef INET
	case AF_INET:
		sin = (struct sockaddr_in *)sa;
		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
			return EADDRNOTAVAIL;
		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
		sdl->sdl_alen = INFINIBAND_ALEN;
		e_addr = LLADDR(sdl);
		ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
		    e_addr);
		*llsa = (struct sockaddr *)sdl;
		return 0;
#endif
#ifdef INET6
	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)sa;
		/*
		 * An IP6 address of 0 means listen to all
		 * of the multicast address used for IP6.
		 * This has no meaning in ipoib.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			return EADDRNOTAVAIL;
		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
			return EADDRNOTAVAIL;
		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
		sdl->sdl_alen = INFINIBAND_ALEN;
		e_addr = LLADDR(sdl);
		ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
		*llsa = (struct sockaddr *)sdl;
		return 0;
#endif

	default:
		return EAFNOSUPPORT;
	}
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);

static int
ipoib_evhand(module_t mod, int event, void *arg)
{
	return (0);
}

static moduledata_t ipoib_mod = {
	.name = "ipoib",
	.evhand = ipoib_evhand,
};

-DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);
MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);