Index: user/alc/PQ_LAUNDRY/include/xlocale/_locale.h
===================================================================
--- user/alc/PQ_LAUNDRY/include/xlocale/_locale.h	(revision 303516)
+++ user/alc/PQ_LAUNDRY/include/xlocale/_locale.h	(revision 303517)
@@ -1,56 +1,56 @@
 /*-
  * Copyright (c) 2011, 2012 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by David Chisnall under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _XLOCALE_LOCALE_H
 #define _XLOCALE_LOCALE_H
 
 #define LC_COLLATE_MASK  (1<<0)
 #define LC_CTYPE_MASK    (1<<1)
 #define LC_MESSAGES_MASK (1<<2)
 #define LC_MONETARY_MASK (1<<3)
 #define LC_NUMERIC_MASK  (1<<4)
 #define LC_TIME_MASK     (1<<5)
 #define LC_ALL_MASK      (LC_COLLATE_MASK | LC_CTYPE_MASK | LC_MESSAGES_MASK | \
 			  LC_MONETARY_MASK | LC_NUMERIC_MASK | LC_TIME_MASK)
 #define LC_GLOBAL_LOCALE ((locale_t)-1)
 
 #ifndef _LOCALE_T_DEFINED
 #define _LOCALE_T_DEFINED
 typedef struct	_xlocale *locale_t;
 #endif
 
 locale_t	 duplocale(locale_t base);
-int		 freelocale(locale_t loc);
+void		 freelocale(locale_t loc);
 locale_t	 newlocale(int mask, const char *locale, locale_t base);
 const char	*querylocale(int mask, locale_t loc);
 locale_t	 uselocale(locale_t loc);
 
 #endif /* _XLOCALE_LOCALE_H */
Index: user/alc/PQ_LAUNDRY/lib/libc/locale/freelocale.3
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/locale/freelocale.3	(revision 303516)
+++ user/alc/PQ_LAUNDRY/lib/libc/locale/freelocale.3	(revision 303517)
@@ -1,68 +1,59 @@
 .\" Copyright (c) 2011 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" This documentation was written by David Chisnall under sponsorship from
 .\" the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
-.Dd September 17, 2011
+.Dd July 26, 2016
 .Dt FREELOCALE 3
 .Os
 .Sh NAME
 .Nm freelocale
 .Nd Frees a locale created with
 .Xr duplocale 3
 or
 .Xr newlocale 3
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In locale.h
-.Ft int
+.Ft void
 .Fn freelocale "locale_t locale"
 .Sh DESCRIPTION
 Frees a
 .Fa locale_t .
 This relinquishes any resources held exclusively by this locale.
 Note that locales share reference-counted components,
 so a call to this function is not guaranteed to free all of the components.
-.Sh RETURN VALUES
-Returns 0 on success or -1 on error.
 .Sh SEE ALSO
 .Xr duplocale 3 ,
 .Xr localeconv 3 ,
 .Xr newlocale 3 ,
 .Xr querylocale 3 ,
 .Xr uselocale 3 ,
 .Xr xlocale 3
 .Sh STANDARDS
-The
-.Fn freelocale
-function
-differs from
-.St -p1003.1-2008
-in that its return type is
-.Vt int
-rather than
-.Vt void .
+This function conforms to
+.St -p1003.1-2008 .
Index: user/alc/PQ_LAUNDRY/lib/libc/locale/xlocale.c
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/locale/xlocale.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/lib/libc/locale/xlocale.c	(revision 303517)
@@ -1,369 +1,367 @@
 /*-
  * Copyright (c) 2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by David Chisnall under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <pthread.h>
 #include <stdio.h>
 #include <string.h>
 #include <runetype.h>
 #include "libc_private.h"
 #include "xlocale_private.h"
 
 /**
  * Each locale loader declares a global component.  This is used by setlocale()
  * and also by xlocale with LC_GLOBAL_LOCALE..
  */
 extern struct xlocale_component __xlocale_global_collate;
 extern struct xlocale_component __xlocale_global_ctype;
 extern struct xlocale_component __xlocale_global_monetary;
 extern struct xlocale_component __xlocale_global_numeric;
 extern struct xlocale_component __xlocale_global_time;
 extern struct xlocale_component __xlocale_global_messages;
 /*
  * And another version for the statically-allocated C locale.  We only have
  * components for the parts that are expected to be sensible.
  */
 extern struct xlocale_component __xlocale_C_collate;
 extern struct xlocale_component __xlocale_C_ctype;
 
 #ifndef __NO_TLS
 /*
  * The locale for this thread.
  */
 _Thread_local locale_t __thread_locale;
 #endif
 /*
  * Flag indicating that one or more per-thread locales exist.
  */
 int __has_thread_locale;
 /*
  * Private functions in setlocale.c.
  */
 const char *
 __get_locale_env(int category);
 int
 __detect_path_locale(void);
 
 struct _xlocale __xlocale_global_locale = {
 	{0},
 	{
 		&__xlocale_global_collate,
 		&__xlocale_global_ctype,
 		&__xlocale_global_monetary,
 		&__xlocale_global_numeric,
 		&__xlocale_global_time,
 		&__xlocale_global_messages
 	},
 	1,
 	0,
 	1,
 	0
 };
 
 struct _xlocale __xlocale_C_locale = {
 	{0},
 	{
 		&__xlocale_C_collate,
 		&__xlocale_C_ctype,
 		0, 0, 0, 0
 	},
 	1,
 	0,
 	1,
 	0
 };
 
 static void*(*constructors[])(const char*, locale_t) =
 {
 	__collate_load,
 	__ctype_load,
 	__monetary_load,
 	__numeric_load,
 	__time_load,
 	__messages_load
 };
 
 static pthread_key_t locale_info_key;
 static int fake_tls;
 static locale_t thread_local_locale;
 
 static void init_key(void)
 {
 
 	pthread_key_create(&locale_info_key, xlocale_release);
 	pthread_setspecific(locale_info_key, (void*)42);
 	if (pthread_getspecific(locale_info_key) == (void*)42) {
 		pthread_setspecific(locale_info_key, 0);
 	} else {
 		fake_tls = 1;
 	}
 	/* At least one per-thread locale has now been set. */
 	__has_thread_locale = 1;
 	__detect_path_locale();
 }
 
 static pthread_once_t once_control = PTHREAD_ONCE_INIT;
 
 static locale_t
 get_thread_locale(void)
 {
 
 	_once(&once_control, init_key);
 	
 	return (fake_tls ? thread_local_locale :
 		pthread_getspecific(locale_info_key));
 }
 
 #ifdef __NO_TLS
 locale_t
 __get_locale(void)
 {
 	locale_t l = get_thread_locale();
 	return (l ? l : &__xlocale_global_locale);
 
 }
 #endif
 
 static void
 set_thread_locale(locale_t loc)
 {
 	locale_t l = (loc == LC_GLOBAL_LOCALE) ? 0 : loc;
 
 	_once(&once_control, init_key);
 	
 	if (NULL != l) {
 		xlocale_retain((struct xlocale_refcounted*)l);
 	}
 	locale_t old = pthread_getspecific(locale_info_key);
 	if ((NULL != old) && (l != old)) {
 		xlocale_release((struct xlocale_refcounted*)old);
 	}
 	if (fake_tls) {
 		thread_local_locale = l;
 	} else {
 		pthread_setspecific(locale_info_key, l);
 	}
 #ifndef __NO_TLS
 	__thread_locale = l;
 	__set_thread_rune_locale(loc);
 #endif
 }
 
 /**
  * Clean up a locale, once its reference count reaches zero.  This function is
  * called by xlocale_release(), it should not be called directly.
  */
 static void
 destruct_locale(void *l)
 {
 	locale_t loc = l;
 
 	for (int type=0 ; type<XLC_LAST ; type++) {
 		if (loc->components[type]) {
 			xlocale_release(loc->components[type]);
 		}
 	}
 	if (loc->csym) {
 		free(loc->csym);
 	}
 	free(l);
 }
 
 /**
  * Allocates a new, uninitialised, locale.
  */
 static locale_t
 alloc_locale(void)
 {
 	locale_t new = calloc(sizeof(struct _xlocale), 1);
 
 	new->header.destructor = destruct_locale;
 	new->monetary_locale_changed = 1;
 	new->numeric_locale_changed = 1;
 	return (new);
 }
 static void
 copyflags(locale_t new, locale_t old)
 {
 	new->using_monetary_locale = old->using_monetary_locale;
 	new->using_numeric_locale = old->using_numeric_locale;
 	new->using_time_locale = old->using_time_locale;
 	new->using_messages_locale = old->using_messages_locale;
 }
 
 static int dupcomponent(int type, locale_t base, locale_t new) 
 {
 	/* Always copy from the global locale, since it has mutable components.
 	 */
 	struct xlocale_component *src = base->components[type];
 
 	if (&__xlocale_global_locale == base) {
 		new->components[type] = constructors[type](src->locale, new);
 		if (new->components[type]) {
 			strncpy(new->components[type]->locale, src->locale,
 			    ENCODING_LEN);
 		}
 	} else if (base->components[type]) {
 		new->components[type] = xlocale_retain(base->components[type]);
 	} else {
 		/* If the component was NULL, return success - if base is a
 		 * valid locale then the flag indicating that this isn't
 		 * present should be set.  If it isn't a valid locale, then
 		 * we're stuck anyway. */
 		return 1;
 	}
 	return (0 != new->components[type]);
 }
 
 /*
  * Public interfaces.  These are the five public functions described by the
  * xlocale interface.  
  */
 
 locale_t newlocale(int mask, const char *locale, locale_t base)
 {
 	int type;
 	const char *realLocale = locale;
 	int useenv = 0;
 	int success = 1;
 
 	_once(&once_control, init_key);
 
 	locale_t new = alloc_locale();
 	if (NULL == new) {
 		return (NULL);
 	}
 
 	FIX_LOCALE(base);
 	copyflags(new, base);
 
 	if (NULL == locale) {
 		realLocale = "C";
 	} else if ('\0' == locale[0]) {
 		useenv = 1;
 	}
 
 	for (type=0 ; type<XLC_LAST ; type++) {
 		if (mask & 1) {
 			if (useenv) {
 				realLocale = __get_locale_env(type);
 			}
 			new->components[type] =
 			     constructors[type](realLocale, new);
 			if (new->components[type]) {
 				strncpy(new->components[type]->locale,
 				     realLocale, ENCODING_LEN);
 			} else {
 				success = 0;
 				break;
 			}
 		} else {
 			if (!dupcomponent(type, base, new)) {
 				success = 0;
 				break;
 			}
 		}
 		mask >>= 1;
 	}
 	if (0 == success) {
 		xlocale_release(new);
 		new = NULL;
 	}
 
 	return (new);
 }
 
 locale_t duplocale(locale_t base)
 {
 	locale_t new = alloc_locale();
 	int type;
 
 	_once(&once_control, init_key);
 
 	if (NULL == new) {
 		return (NULL);
 	}
 	
 	FIX_LOCALE(base);
 	copyflags(new, base);
 
 	for (type=0 ; type<XLC_LAST ; type++) {
 		dupcomponent(type, base, new);
 	}
 
 	return (new);
 }
 
 /*
  * Free a locale_t.  This is quite a poorly named function.  It actually
  * disclaims a reference to a locale_t, rather than freeing it.  
  */
-int
+void
 freelocale(locale_t loc)
 {
-	/* Fail if we're passed something that isn't a locale. */
-	if ((NULL == loc) || (LC_GLOBAL_LOCALE == loc)) {
-		return (-1);
-	}
-	/* If we're passed the global locale, pretend that we freed it but don't
-	 * actually do anything. */
-	if (&__xlocale_global_locale == loc) {
-		return (0);
-	}
-	xlocale_release(loc);
-	return (0);
+
+	/*
+	 * Do nothing if we're passed something that isn't a locale. If we're
+	 * passed the global locale, pretend that we freed it but don't
+	 * actually do anything.
+	 */
+	if (loc != NULL && loc != LC_GLOBAL_LOCALE &&
+	    loc != &__xlocale_global_locale)
+		xlocale_release(loc);
 }
 
 /*
  * Returns the name of the locale for a particular component of a locale_t.
  */
 const char *querylocale(int mask, locale_t loc)
 {
 	int type = ffs(mask) - 1;
 	FIX_LOCALE(loc);
 	if (type >= XLC_LAST)
 		return (NULL);
 	if (loc->components[type])
 		return (loc->components[type]->locale);
 	return ("C");
 }
 
 /*
  * Installs the specified locale_t as this thread's locale.
  */
 locale_t uselocale(locale_t loc)
 {
 	locale_t old = get_thread_locale();
 	if (NULL != loc) {
 		set_thread_locale(loc);
 	}
 	return (old ? old : LC_GLOBAL_LOCALE);
 }
 
Index: user/alc/PQ_LAUNDRY/share/man/man4/if_ntb.4
===================================================================
--- user/alc/PQ_LAUNDRY/share/man/man4/if_ntb.4	(revision 303516)
+++ user/alc/PQ_LAUNDRY/share/man/man4/if_ntb.4	(revision 303517)
@@ -1,86 +1,89 @@
 .\"
 .\" Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 10, 2016
+.Dd July 29, 2016
 .Dt IF_NTB 4
 .Os
 .Sh NAME
 .Nm if_ntb
 .Nd Virtual Ethernet interface for Non-Transparent Bridges
 .Sh SYNOPSIS
 To compile this driver into your kernel,
 place the following lines in your kernel configuration file:
 .Bd -ragged -offset indent
 .Cd "device if_ntb"
 .Ed
 .Pp
 Or, to load the driver as a module at boot, place the following line in
 .Xr loader.conf 5 :
 .Bd -literal -offset indent
 if_ntb_load="YES"
 .Ed
 .Pp
 The following tunables are settable from the
 .Xr loader 8 :
 .Bl -ohang
 .It Va hw.if_ntb.num_queues
 Number of transport queues to use per interface.
-Default is 1.
+Default is unlimited.
 .El
 .Sh DESCRIPTION
 The
 .Nm
 driver attaches on top of the
 .Xr ntb_transport 4
 driver to utilize its resources to create virtual Ethernet interface between
 the systems.
 Interface capabilities depend on the underlying transport.
 Typical MTU is about 64KB to reduce overhead.
 By default one queue is used, but more may be configured.
 The MAC address for interface is randomly generated.
 .Pp
 The
 .Nm
 driver does not implement any real hardware offload, but since PCIe link is
 protected by CRC32, in some situations it may be possible to save some CPU
 cycles by enabling fake checksum offload on both link sides via setting
 .Cm rxcsum
 and
 .Cm txcsum
 interface options.
 .Sh SEE ALSO
 .Xr ntb_transport 4
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 driver was developed by Intel and originally written by
 .An Carl Delsey Aq Mt carl@FreeBSD.org .
 Later improvements were done by
 .An Conrad E. Meyer Aq Mt cem@FreeBSD.org
 and
 .An Alexander Motin Aq Mt mav@FreeBSD.org .
+.Sh BUGS
+Linux supports only one queue per interface, so manual configuration
+may be required for compatibility.
Index: user/alc/PQ_LAUNDRY/share/man/man4/ntb_transport.4
===================================================================
--- user/alc/PQ_LAUNDRY/share/man/man4/ntb_transport.4	(revision 303516)
+++ user/alc/PQ_LAUNDRY/share/man/man4/ntb_transport.4	(revision 303517)
@@ -1,74 +1,79 @@
 .\"
 .\" Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 10, 2016
+.Dd July 29, 2016
 .Dt NTB_TRANSPORT 4
 .Os
 .Sh NAME
 .Nm ntb_transport
 .Nd Packet-oriented transport for Non-Transparent Bridges
 .Sh SYNOPSIS
 To load the driver as a module at boot, place the following line in
 .Xr loader.conf 5 :
 .Bd -literal -offset indent
 ntb_transport_load="YES"
 .Ed
 .Pp
 The following tunables are settable from the
 .Xr loader 8 :
 .Bl -ohang
 .It Va hw.ntb_transport.debug_level
 Driver debug level.
 The default value is 0, higher means more verbose.
-.It Va hw.ntb_transport.max_num_clients
-Number of bidirectional queues to setup.
-The default value is 0, that means one queue per available memory window.
-Maximal number is limited by number of doorbells.
+.It Va hint.ntb_transport. Ns Ar X Ns Va .config
+Configures queue allocation for consumer devices, separated by commas.
+Each device can be configured as: "<name>[:<queues>]", where:
+.Va name
+is the name of the driver which should attach the device (empty means any),
+.Va queues
+is the number of queues to allocate (empty means automatic).
+The default configuration is an empty string, which means a single device
+with one queue per memory window, allowing any driver to attach.
 .El
 .Sh DESCRIPTION
 The
 .Nm
 driver attaches on top of the
 .Nm ntb
 driver to utilize its resources to create set of bidirectional queues,
 delivering packets between the systems.
 The primary purpose of this is to be used by
 .Nm if_ntb
 network interface, but other consumers may also be developed using KPI.
 .Sh SEE ALSO
 .Xr if_ntb 4 ,
 .Xr ntb_hw 4
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 driver was developed by Intel and originally written by
 .An Carl Delsey Aq Mt carl@FreeBSD.org .
 Later improvements were done by
 .An Conrad E. Meyer Aq Mt cem@FreeBSD.org
 and
 .An Alexander Motin Aq Mt mav@FreeBSD.org .
Index: user/alc/PQ_LAUNDRY/sys/dev/ntb/if_ntb/if_ntb.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/ntb/if_ntb/if_ntb.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/dev/ntb/if_ntb/if_ntb.c	(revision 303517)
@@ -1,516 +1,517 @@
 /*-
  * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
  * Copyright (C) 2013 Intel Corporation
  * Copyright (C) 2015 EMC Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The Non-Transparent Bridge (NTB) is a device that allows you to connect
  * two or more systems using a PCI-e links, providing remote memory access.
  *
  * This module contains a driver for simulated Ethernet device, using
  * underlying NTB Transport device.
  *
  * NOTE: Much of the code in this module is shared with Linux. Any patches may
  * be picked up and redistributed in Linux with a dual GPL/BSD license.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/buf_ring.h>
 #include <sys/bus.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 
 #include <machine/bus.h>
 
 #include "../ntb_transport.h"
 
 #define KTR_NTB KTR_SPARE3
 #define NTB_MEDIATYPE		 (IFM_ETHER | IFM_AUTO | IFM_FDX)
 
 #define	NTB_CSUM_FEATURES	(CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP)
 #define	NTB_CSUM_FEATURES6	(CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6)
 #define	NTB_CSUM_SET		(CSUM_DATA_VALID | CSUM_DATA_VALID_IPV6 | \
 				    CSUM_PSEUDO_HDR | \
 				    CSUM_IP_CHECKED | CSUM_IP_VALID | \
 				    CSUM_SCTP_VALID)
 
 static SYSCTL_NODE(_hw, OID_AUTO, if_ntb, CTLFLAG_RW, 0, "if_ntb");
 
-static unsigned g_if_ntb_num_queues = 1;
+static unsigned g_if_ntb_num_queues = UINT_MAX;
 SYSCTL_UINT(_hw_if_ntb, OID_AUTO, num_queues, CTLFLAG_RWTUN,
     &g_if_ntb_num_queues, 0, "Number of queues per interface");
 
 struct ntb_net_queue {
 	struct ntb_net_ctx	*sc;
 	if_t			 ifp;
 	struct ntb_transport_qp *qp;
 	struct buf_ring		*br;
 	struct task		 tx_task;
 	struct taskqueue	*tx_tq;
 	struct mtx		 tx_lock;
 	struct callout		 queue_full;
 };
 
 struct ntb_net_ctx {
 	if_t			 ifp;
 	struct ifmedia		 media;
 	u_char			 eaddr[ETHER_ADDR_LEN];
 	int			 num_queues;
 	struct ntb_net_queue	*queues;
 	int			 mtu;
 };
 
 static int ntb_net_probe(device_t dev);
 static int ntb_net_attach(device_t dev);
 static int ntb_net_detach(device_t dev);
 static void ntb_net_init(void *arg);
 static int ntb_ifmedia_upd(struct ifnet *);
 static void ntb_ifmedia_sts(struct ifnet *, struct ifmediareq *);
 static int ntb_ioctl(if_t ifp, u_long command, caddr_t data);
 static int ntb_transmit(if_t ifp, struct mbuf *m);
 static void ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
     void *data, int len);
 static void ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
     void *data, int len);
 static void ntb_net_event_handler(void *data, enum ntb_link_event status);
 static void ntb_handle_tx(void *arg, int pending);
 static void ntb_qp_full(void *arg);
 static void ntb_qflush(if_t ifp);
 static void create_random_local_eui48(u_char *eaddr);
 
 static int
 ntb_net_probe(device_t dev)
 {
 
 	device_set_desc(dev, "NTB Network Interface");
 	return (0);
 }
 
 static int
 ntb_net_attach(device_t dev)
 {
 	struct ntb_net_ctx *sc = device_get_softc(dev);
 	struct ntb_net_queue *q;
 	if_t ifp;
 	struct ntb_queue_handlers handlers = { ntb_net_rx_handler,
 	    ntb_net_tx_handler, ntb_net_event_handler };
 	int i;
 
 	ifp = sc->ifp = if_gethandle(IFT_ETHER);
 	if (ifp == NULL) {
 		printf("ntb: Cannot allocate ifnet structure\n");
 		return (ENOMEM);
 	}
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	if_setdev(ifp, dev);
 
-	sc->num_queues = g_if_ntb_num_queues;
+	sc->num_queues = min(g_if_ntb_num_queues,
+	    ntb_transport_queue_count(dev));
 	sc->queues = malloc(sc->num_queues * sizeof(struct ntb_net_queue),
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	sc->mtu = INT_MAX;
 	for (i = 0; i < sc->num_queues; i++) {
 		q = &sc->queues[i];
 		q->sc = sc;
 		q->ifp = ifp;
-		q->qp = ntb_transport_create_queue(q,
-		    device_get_parent(dev), &handlers);
+		q->qp = ntb_transport_create_queue(dev, i, &handlers, q);
 		if (q->qp == NULL)
 			break;
 		sc->mtu = imin(sc->mtu, ntb_transport_max_size(q->qp));
 		mtx_init(&q->tx_lock, "ntb tx", NULL, MTX_DEF);
 		q->br = buf_ring_alloc(4096, M_DEVBUF, M_WAITOK, &q->tx_lock);
 		TASK_INIT(&q->tx_task, 0, ntb_handle_tx, q);
 		q->tx_tq = taskqueue_create_fast("ntb_txq", M_NOWAIT,
 		    taskqueue_thread_enqueue, &q->tx_tq);
 		taskqueue_start_threads(&q->tx_tq, 1, PI_NET, "%s txq%d",
 		    device_get_nameunit(dev), i);
 		callout_init(&q->queue_full, 1);
 	}
 	sc->num_queues = i;
+	device_printf(dev, "%d queue(s)\n", sc->num_queues);
 
 	if_setinitfn(ifp, ntb_net_init);
 	if_setsoftc(ifp, sc);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
 	if_setioctlfn(ifp, ntb_ioctl);
 	if_settransmitfn(ifp, ntb_transmit);
 	if_setqflushfn(ifp, ntb_qflush);
 	create_random_local_eui48(sc->eaddr);
 	ether_ifattach(ifp, sc->eaddr);
 	if_setcapabilities(ifp, IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 |
 	    IFCAP_JUMBO_MTU | IFCAP_LINKSTATE);
 	if_setcapenable(ifp, IFCAP_JUMBO_MTU | IFCAP_LINKSTATE);
 	if_setmtu(ifp, sc->mtu - ETHER_HDR_LEN);
 
 	ifmedia_init(&sc->media, IFM_IMASK, ntb_ifmedia_upd,
 	    ntb_ifmedia_sts);
 	ifmedia_add(&sc->media, NTB_MEDIATYPE, 0, NULL);
 	ifmedia_set(&sc->media, NTB_MEDIATYPE);
 
 	for (i = 0; i < sc->num_queues; i++)
 		ntb_transport_link_up(sc->queues[i].qp);
 	return (0);
 }
 
 static int
 ntb_net_detach(device_t dev)
 {
 	struct ntb_net_ctx *sc = device_get_softc(dev);
 	struct ntb_net_queue *q;
 	int i;
 
 	for (i = 0; i < sc->num_queues; i++)
 		ntb_transport_link_down(sc->queues[i].qp);
 	ether_ifdetach(sc->ifp);
 	if_free(sc->ifp);
 	ifmedia_removeall(&sc->media);
 	for (i = 0; i < sc->num_queues; i++) {
 		q = &sc->queues[i];
 		ntb_transport_free_queue(q->qp);
 		buf_ring_free(q->br, M_DEVBUF);
 		callout_drain(&q->queue_full);
 		taskqueue_drain_all(q->tx_tq);
 		mtx_destroy(&q->tx_lock);
 	}
 	free(sc->queues, M_DEVBUF);
 	return (0);
 }
 
 /* Network device interface */
 
 static void
 ntb_net_init(void *arg)
 {
 	struct ntb_net_ctx *sc = arg;
 	if_t ifp = sc->ifp;
 
 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 	if_link_state_change(ifp, ntb_transport_link_query(sc->queues[0].qp) ?
 	    LINK_STATE_UP : LINK_STATE_DOWN);
 }
 
 static int
 ntb_ioctl(if_t ifp, u_long command, caddr_t data)
 {
 	struct ntb_net_ctx *sc = if_getsoftc(ifp);
 	struct ifreq *ifr = (struct ifreq *)data;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFMTU:
 	    {
 		if (ifr->ifr_mtu > sc->mtu - ETHER_HDR_LEN) {
 			error = EINVAL;
 			break;
 		}
 
 		if_setmtu(ifp, ifr->ifr_mtu);
 		break;
 	    }
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->media, command);
 		break;
 
 	case SIOCSIFCAP:
 		if (ifr->ifr_reqcap & IFCAP_RXCSUM)
 			if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
 		else
 			if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
 		if (ifr->ifr_reqcap & IFCAP_TXCSUM) {
 			if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
 			if_sethwassistbits(ifp, NTB_CSUM_FEATURES, 0);
 		} else {
 			if_setcapenablebit(ifp, 0, IFCAP_TXCSUM);
 			if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES);
 		}
 		if (ifr->ifr_reqcap & IFCAP_RXCSUM_IPV6)
 			if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
 		else
 			if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
 		if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) {
 			if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
 			if_sethwassistbits(ifp, NTB_CSUM_FEATURES6, 0);
 		} else {
 			if_setcapenablebit(ifp, 0, IFCAP_TXCSUM_IPV6);
 			if_sethwassistbits(ifp, 0, NTB_CSUM_FEATURES6);
 		}
 		break;
 
 	default:
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 
 	return (error);
 }
 
 static int
 ntb_ifmedia_upd(struct ifnet *ifp)
 {
 	struct ntb_net_ctx *sc = if_getsoftc(ifp);
 	struct ifmedia *ifm = &sc->media;
 
 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
 		return (EINVAL);
 
 	return (0);
 }
 
 static void
 ntb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct ntb_net_ctx *sc = if_getsoftc(ifp);
 
 	ifmr->ifm_status = IFM_AVALID;
 	ifmr->ifm_active = NTB_MEDIATYPE;
 	if (ntb_transport_link_query(sc->queues[0].qp))
 		ifmr->ifm_status |= IFM_ACTIVE;
 }
 
 static void
 ntb_transmit_locked(struct ntb_net_queue *q)
 {
 	if_t ifp = q->ifp;
 	struct mbuf *m;
 	int rc, len;
 	short mflags;
 
 	CTR0(KTR_NTB, "TX: ntb_transmit_locked");
 	while ((m = drbr_peek(ifp, q->br)) != NULL) {
 		CTR1(KTR_NTB, "TX: start mbuf %p", m);
 		if_etherbpfmtap(ifp, m);
 		len = m->m_pkthdr.len;
 		mflags = m->m_flags;
 		rc = ntb_transport_tx_enqueue(q->qp, m, m, len);
 		if (rc != 0) {
 			CTR2(KTR_NTB, "TX: could not tx mbuf %p: %d", m, rc);
 			if (rc == EAGAIN) {
 				drbr_putback(ifp, q->br, m);
 				callout_reset_sbt(&q->queue_full,
 				    SBT_1MS / 4, SBT_1MS / 4,
 				    ntb_qp_full, q, 0);
 			} else {
 				m_freem(m);
 				drbr_advance(ifp, q->br);
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			}
 			break;
 		}
 		drbr_advance(ifp, q->br);
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if (mflags & M_MCAST)
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	}
 }
 
 /*
  * if_transmit method: pick a queue, append the mbuf to its buf_ring and try
  * to push the ring out.
  *
  * The queue is chosen from the mbuf's flowid when it carries a hash type, and
  * from the current CPU otherwise.  If the queue's TX lock is contended the
  * drain is deferred to the queue's taskqueue.  Returns the drbr_enqueue()
  * error, or 0 once the mbuf is queued.
  */
 static int
 ntb_transmit(if_t ifp, struct mbuf *m)
 {
 	struct ntb_net_ctx *sc = if_getsoftc(ifp);
 	struct ntb_net_queue *txq;
 	int idx, rc;
 
 	CTR0(KTR_NTB, "TX: ntb_transmit");
 	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE)
 		idx = curcpu % sc->num_queues;
 	else
 		idx = m->m_pkthdr.flowid % sc->num_queues;
 	txq = &sc->queues[idx];
 
 	rc = drbr_enqueue(ifp, txq->br, m);
 	if (rc)
 		return (rc);
 
 	if (!mtx_trylock(&txq->tx_lock)) {
 		/* Someone else holds the lock; let the task drain the ring. */
 		taskqueue_enqueue(txq->tx_tq, &txq->tx_task);
 		return (0);
 	}
 	ntb_transmit_locked(txq);
 	mtx_unlock(&txq->tx_lock);
 	return (0);
 }
 
 /*
  * Taskqueue handler for deferred transmission: drain the queue passed in
  * `arg' while holding its TX lock.  `pending' is unused.
  */
 static void
 ntb_handle_tx(void *arg, int pending)
 {
 	struct ntb_net_queue *txq;
 
 	txq = arg;
 	mtx_lock(&txq->tx_lock);
 	ntb_transmit_locked(txq);
 	mtx_unlock(&txq->tx_lock);
 }
 
 /*
  * Callout armed when the transport refused a packet with EAGAIN.  Once the
  * transport reports at least one free TX entry the transmit task is kicked;
  * until then the callout reschedules itself every quarter millisecond.
  */
 static void
 ntb_qp_full(void *arg)
 {
 	struct ntb_net_queue *txq = arg;
 
 	CTR0(KTR_NTB, "TX: qp_full callout");
 	if (!(ntb_transport_tx_free_entry(txq->qp) > 0)) {
 		/* Still no room: poll again later. */
 		callout_schedule_sbt(&txq->queue_full,
 		    SBT_1MS / 4, SBT_1MS / 4, 0);
 		return;
 	}
 	taskqueue_enqueue(txq->tx_tq, &txq->tx_task);
 }
 
 /*
  * if_qflush method: free every mbuf still sitting in each queue's buf_ring
  * (under that queue's TX lock), then flush the generic interface queue.
  */
 static void
 ntb_qflush(if_t ifp)
 {
 	struct ntb_net_ctx *sc = if_getsoftc(ifp);
 	struct ntb_net_queue *txq;
 	struct mbuf *pkt;
 	int qi;
 
 	for (qi = 0; qi < sc->num_queues; qi++) {
 		txq = &sc->queues[qi];
 		mtx_lock(&txq->tx_lock);
 		for (;;) {
 			pkt = buf_ring_dequeue_sc(txq->br);
 			if (pkt == NULL)
 				break;
 			m_freem(pkt);
 		}
 		mtx_unlock(&txq->tx_lock);
 	}
 	if_qflush(ifp);
 }
 
 /* Network Device Callbacks */
 /*
  * Transport TX-completion callback: `data' is the mbuf that was handed to
  * ntb_transport_tx_enqueue(); release it.  The remaining arguments are
  * unused.  The trace entry records only the pointer value.
  */
 static void
 ntb_net_tx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
     int len)
 {
 	struct mbuf *done = data;
 
 	m_freem(done);
 	CTR1(KTR_NTB, "TX: tx_handler freeing mbuf %p", done);
 }
 
 /*
  * Transport RX callback: deliver a received mbuf to the network stack.
  *
  * A negative `len' is counted as an input error and nothing is delivered.
  * Otherwise the mbuf is tagged with the receiving interface, with the queue
  * index as an opaque flowid when several queues exist, and, when RX checksum
  * offload is enabled for the packet's protocol, with NTB_CSUM_SET so the
  * stack skips checksum verification.
  */
 static void
 ntb_net_rx_handler(struct ntb_transport_qp *qp, void *qp_data, void *data,
     int len)
 {
 	struct ntb_net_queue *rxq = qp_data;
 	struct ntb_net_ctx *sc = rxq->sc;
 	struct mbuf *pkt = data;
 	if_t ifp = rxq->ifp;
 	uint16_t ether_type;
 	int wanted_cap;
 
 	CTR1(KTR_NTB, "RX: rx handler (%d)", len);
 	if (len < 0) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		return;
 	}
 
 	pkt->m_pkthdr.rcvif = ifp;
 	if (sc->num_queues > 1) {
 		pkt->m_pkthdr.flowid = rxq - sc->queues;
 		M_HASHTYPE_SET(pkt, M_HASHTYPE_OPAQUE);
 	}
 
 	if ((if_getcapenable(ifp) &
 	    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) != 0) {
 		/* Bytes 12-13 of the frame hold the EtherType. */
 		m_copydata(pkt, 12, 2, (void *)&ether_type);
 		ether_type = ntohs(ether_type);
 
 		if (ether_type == ETHERTYPE_IP)
 			wanted_cap = IFCAP_RXCSUM;
 		else if (ether_type == ETHERTYPE_IPV6)
 			wanted_cap = IFCAP_RXCSUM_IPV6;
 		else
 			wanted_cap = 0;
 
 		if (wanted_cap != 0 &&
 		    (if_getcapenable(ifp) & wanted_cap) != 0) {
 			pkt->m_pkthdr.csum_data = 0xffff;
 			pkt->m_pkthdr.csum_flags = NTB_CSUM_SET;
 		}
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_input(ifp, pkt);
 }
 
 /*
  * Transport link-event callback: translate the NTB link event into an ifnet
  * link state and publish it on the queue's interface.  Events other than
  * up/down map to LINK_STATE_UNKNOWN.
  */
 static void
 ntb_net_event_handler(void *data, enum ntb_link_event status)
 {
 	struct ntb_net_queue *evq = data;
 	int link_state;
 
 	if (status == NTB_LINK_DOWN)
 		link_state = LINK_STATE_DOWN;
 	else if (status == NTB_LINK_UP)
 		link_state = LINK_STATE_UP;
 	else
 		link_state = LINK_STATE_UNKNOWN;
 
 	if_link_state_change(evq->ifp, link_state);
 }
 
 /* Helper functions */
 /* TODO: This too should really be part of the kernel */
 /*
  * Flag bits of the first EUI-48 octet.  The expansions are parenthesized so
  * the macros stay correct inside larger expressions (e.g. combined with `|'
  * or `&', which bind looser than `<<').
  */
 #define EUI48_MULTICAST			(1 << 0)
 #define EUI48_LOCALLY_ADMINISTERED	(1 << 1)
 
 /*
  * Fill the 6-byte buffer `eaddr' with a locally administered unicast address:
  *   octet 0    - EUI48_LOCALLY_ADMINISTERED (multicast bit clear)
  *   octets 1-4 - the current value of `ticks', copied in host byte order
  *   octet 5    - a static 8-bit counter, incremented on every call
  *
  * The counter is a plain static with no locking.
  */
 static void
 create_random_local_eui48(u_char *eaddr)
 {
 	static uint8_t counter = 0;
 	uint32_t seed = ticks;
 
 	eaddr[0] = EUI48_LOCALLY_ADMINISTERED;
 	memcpy(&eaddr[1], &seed, sizeof(uint32_t));
 	eaddr[5] = counter++;
 }
 
 /* newbus method table: only probe, attach and detach are implemented. */
 static device_method_t ntb_net_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,     ntb_net_probe),
 	DEVMETHOD(device_attach,    ntb_net_attach),
 	DEVMETHOD(device_detach,    ntb_net_detach),
 	DEVMETHOD_END
 };
 
 /*
  * Driver glue: the driver is named "ntb", uses struct ntb_net_ctx as its
  * softc, and is registered as module if_ntb on the ntb_transport bus, on
  * which it declares a version 1 dependency.
  */
 devclass_t ntb_net_devclass;
 static DEFINE_CLASS_0(ntb, ntb_net_driver, ntb_net_methods,
     sizeof(struct ntb_net_ctx));
 DRIVER_MODULE(if_ntb, ntb_transport, ntb_net_driver, ntb_net_devclass,
     NULL, NULL);
 MODULE_DEPEND(if_ntb, ntb_transport, 1, 1, 1);
 MODULE_VERSION(if_ntb, 1);
Index: user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_hw/ntb_hw.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_hw/ntb_hw.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_hw/ntb_hw.c	(revision 303517)
@@ -1,3094 +1,3095 @@
 /*-
  * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
  * Copyright (C) 2013 Intel Corporation
  * Copyright (C) 2015 EMC Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The Non-Transparent Bridge (NTB) is a device that allows you to connect
  * two or more systems using PCI-e links, providing remote memory access.
  *
  * This module contains a driver for NTB hardware in Intel Xeon/Atom CPUs.
  *
  * NOTE: Much of the code in this module is shared with Linux. Any patches may
  * be picked up and redistributed in Linux with a dual GPL/BSD license.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/interrupt.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/pciio.h>
 #include <sys/queue.h>
 #include <sys/rman.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <machine/bus.h>
 #include <machine/intr_machdep.h>
 #include <machine/resource.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include "ntb_regs.h"
 #include "../ntb.h"
 
 /* Upper bound on interrupt vectors; sizes int_info[] in struct ntb_softc. */
 #define MAX_MSIX_INTERRUPTS MAX(XEON_DB_COUNT, ATOM_DB_COUNT)
 
 #define NTB_HB_TIMEOUT		1 /* second */
 #define ATOM_LINK_RECOVERY_TIME	500 /* ms */
 /* Mask that clears the low 12 bits of a 64-bit value. */
 #define BAR_HIGH_MASK		(~((1ull << 12) - 1))
 
 /*
  * Magic values; NOTE(review): presumably written to scratchpads during the
  * peer MSI-X exchange (see the NTB_MSIX_* indices) -- confirm at the users.
  */
 #define	NTB_MSIX_VER_GUARD	0xaabbccdd
 #define	NTB_MSIX_RECEIVED	0xe0f0e0f0
 
 /*
  * PCI constants could be somewhere more generic, but aren't defined/used in
  * pci.c.
  */
 #define	PCI_MSIX_ENTRY_SIZE		16
 #define	PCI_MSIX_ENTRY_LOWER_ADDR	0
 #define	PCI_MSIX_ENTRY_UPPER_ADDR	4
 #define	PCI_MSIX_ENTRY_DATA		8
 
 /* Hardware family; selects the Atom or Xeon detect/init paths at attach. */
 enum ntb_device_type {
 	NTB_XEON,
 	NTB_ATOM
 };
 
 /*
  * Connection topology.  ntb_conn_type are hardware numbers, cannot change.
  */
 enum ntb_conn_type {
 	NTB_CONN_TRANSPARENT = 0,
 	NTB_CONN_B2B = 1,
 	NTB_CONN_RP = 2,
 };
 
 /*
  * Side of a back-to-back link; the sysctl descriptions in this file refer to
  * "usd" as the upstream and "dsd" as the downstream side.
  */
 enum ntb_b2b_direction {
 	NTB_DEV_USD = 0,
 	NTB_DEV_DSD = 1,
 };
 
 /*
  * Index into ntb_softc.bar_info[].  intel_ntb_map_pci_bars() binds these to
  * PCI BAR 0, 2, 4 and 5 respectively (BAR 5 only with NTB_SPLIT_BAR).
  */
 enum ntb_bar {
 	NTB_CONFIG_BAR = 0,
 	NTB_B2B_BAR_1,
 	NTB_B2B_BAR_2,
 	NTB_B2B_BAR_3,
 	NTB_MAX_BARS
 };
 
 /*
  * NOTE(review): judging by NTB_MAX_MSIX_SPAD these look like scratchpad
  * indices used for the peer MSI-X exchange -- confirm at the users.
  */
 enum {
 	NTB_MSIX_GUARD = 0,
 	NTB_MSIX_DATA0,
 	NTB_MSIX_DATA1,
 	NTB_MSIX_DATA2,
 	NTB_MSIX_OFS0,
 	NTB_MSIX_OFS1,
 	NTB_MSIX_OFS2,
 	NTB_MSIX_DONE,
 	NTB_MAX_MSIX_SPAD
 };
 
 /*
  * Device features and workarounds.
  * True when any bit of `feature' is set in (ntb)->features.
  */
 #define HAS_FEATURE(ntb, feature)	\
 	(((ntb)->features & (feature)) != 0)
 
 /* One supported-device entry; see the pci_ids[] table. */
 struct ntb_hw_info {
 	uint32_t		device_id;	/* compared with pci_get_devid() */
 	const char		*desc;		/* passed to device_set_desc() */
 	enum ntb_device_type	type;
 	uint32_t		features;	/* initial NTB_* feature bits */
 };
 
 /*
  * Per-BAR bookkeeping, one entry per enum ntb_bar value.
  * pci_resource_id and pci_resource are set while mapping the BAR; the
  * remaining mapping fields are presumably filled in by save_bar_parameters().
  */
 struct ntb_pci_bar_info {
 	bus_space_tag_t		pci_bus_tag;
 	bus_space_handle_t	pci_bus_handle;
 	int			pci_resource_id;
 	struct resource		*pci_resource;
 	vm_paddr_t		pbase;
 	caddr_t			vbase;
 	vm_size_t		size;
 	vm_memattr_t		map_mode;
 
 	/* Configuration register offsets */
 	uint32_t		psz_off;
 	uint32_t		ssz_off;
 	uint32_t		pbarxlat_off;
 };
 
 /*
  * Per-interrupt state, one slot per possible vector (see int_info[]).
  * NOTE(review): presumably the IRQ resource, its rid and the handler cookie.
  */
 struct ntb_int_info {
 	struct resource	*res;
 	int		rid;
 	void		*tag;
 };
 
 /*
  * Per-vector context with a back pointer to the owning softc.
  * NOTE(review): presumably the argument handed to ndev_vec_isr() -- confirm.
  */
 struct ntb_vec {
 	struct ntb_softc	*ntb;
 	uint32_t		num;
 	unsigned		masked;
 };
 
 /*
  * Per-family register description (see atom_reg and xeon_reg).
  * db_size is the doorbell register width in bytes; mw_bar[] maps a memory
  * window index to the enum ntb_bar that backs it.
  */
 struct ntb_reg {
 	uint32_t	ntb_ctl;
 	uint32_t	lnk_sta;
 	uint8_t		db_size;
 	unsigned	mw_bar[NTB_MAX_BARS];
 };
 
 /*
  * Register offsets of one doorbell/scratchpad register bank (doorbell,
  * doorbell mask, scratchpad base).
  */
 struct ntb_alt_reg {
 	uint32_t	db_bell;
 	uint32_t	db_mask;
 	uint32_t	spad;
 };
 
 /*
  * Offsets of the BAR base, translation and limit registers.
  * bar_get_xlat_params() returns the bar2/bar4/bar5 triples for
  * NTB_B2B_BAR_1/2/3 respectively.
  */
 struct ntb_xlat_reg {
 	uint32_t	bar0_base;
 	uint32_t	bar2_base;
 	uint32_t	bar4_base;
 	uint32_t	bar5_base;
 
 	uint32_t	bar2_xlat;
 	uint32_t	bar4_xlat;
 	uint32_t	bar5_xlat;
 
 	uint32_t	bar2_limit;
 	uint32_t	bar4_limit;
 	uint32_t	bar5_limit;
 };
 
 /*
  * Bus addresses used between the two NTB devices in B2B topology.  The
  * *_addr64 members are for 64-bit BARs; the *_addr32 members are for
  * split-BAR mode (see the hw.ntb.xeon_b2b sysctl descriptions).
  */
 struct ntb_b2b_addr {
 	uint64_t	bar0_addr;
 	uint64_t	bar2_addr64;
 	uint64_t	bar4_addr64;
 	uint64_t	bar4_addr32;
 	uint64_t	bar5_addr32;
 };
 
 /*
  * One MSI-X message description, an (offset, data) pair.
  * NOTE(review): exact meaning of the offset is set by the users -- confirm.
  */
 struct ntb_msix_data {
 	uint32_t	nmd_ofs;
 	uint32_t	nmd_data;
 };
 
 /*
  * Per-device driver state (the newbus softc, obtained through
  * device_get_softc()).
  */
 struct ntb_softc {
 	/* ntb.c context. Do not move! Must go first! */
 	void			*ntb_store;
 
 	device_t		device;
 	enum ntb_device_type	type;
 	uint32_t		features;	/* NTB_* bits, see HAS_FEATURE() */
 
 	struct ntb_pci_bar_info	bar_info[NTB_MAX_BARS];
 	struct ntb_int_info	int_info[MAX_MSIX_INTERRUPTS];
 	uint32_t		allocated_interrupts;
 
 	struct ntb_msix_data	peer_msix_data[XEON_NONLINK_DB_MSIX_BITS];
 	struct ntb_msix_data	msix_data[XEON_NONLINK_DB_MSIX_BITS];
 	bool			peer_msix_good;
 	bool			peer_msix_done;
 	struct ntb_pci_bar_info	*peer_lapic_bar;
 	struct callout		peer_msix_work;
 
 	struct callout		heartbeat_timer;
 	struct callout		lr_timer;
 
 	struct ntb_vec		*msix_vec;
 
 	uint32_t		ppd;
 	enum ntb_conn_type	conn_type;
 	enum ntb_b2b_direction	dev_type;
 
 	/* Offset of peer bar0 in B2B BAR */
 	uint64_t			b2b_off;
 	/* Memory window used to access peer bar0 */
 #define B2B_MW_DISABLED			UINT8_MAX
 	uint8_t				b2b_mw_idx;
 	uint32_t			msix_xlat;
 	/* Also initialized to B2B_MW_DISABLED by intel_ntb_attach(). */
 	uint8_t				msix_mw_idx;
 
 	uint8_t				mw_count;
 	uint8_t				spad_count;
 	uint8_t				db_count;
 	uint8_t				db_vec_count;
 	uint8_t				db_vec_shift;
 
 	/* Protects local db_mask. */
 #define DB_MASK_LOCK(sc)	mtx_lock_spin(&(sc)->db_mask_lock)
 #define DB_MASK_UNLOCK(sc)	mtx_unlock_spin(&(sc)->db_mask_lock)
 #define DB_MASK_ASSERT(sc,f)	mtx_assert(&(sc)->db_mask_lock, (f))
 	struct mtx			db_mask_lock;
 
 	volatile uint32_t		ntb_ctl;
 	volatile uint32_t		lnk_sta;
 
 	uint64_t			db_valid_mask;
 	uint64_t			db_link_mask;
 	uint64_t			db_mask;
 	uint64_t			fake_db_bell;	/* NTB_SB01BASE_LOCKUP*/
 
 	int				last_ts;	/* ticks @ last irq */
 
 	/* Register layout tables selected for this device. */
 	const struct ntb_reg		*reg;
 	const struct ntb_alt_reg	*self_reg;
 	const struct ntb_alt_reg	*peer_reg;
 	const struct ntb_xlat_reg	*xlat_reg;
 };
 
 #ifdef __i386__
 /*
  * 64-bit bus-space accessors for i386, built from two 32-bit accesses.  The
  * combined access is therefore not a single bus transaction.
  */
 
 /*
  * Read 64 bits at `offset': the low word first, then the high word at
  * offset + 4.  The two reads are separate statements so that their order is
  * fixed; as operands of a single `|' expression the order would be
  * unspecified, which matters for device registers.
  */
 static __inline uint64_t
 bus_space_read_8(bus_space_tag_t tag, bus_space_handle_t handle,
     bus_size_t offset)
 {
 	uint64_t lo, hi;
 
 	lo = bus_space_read_4(tag, handle, offset);
 	hi = bus_space_read_4(tag, handle, offset + 4);
 	return (lo | (hi << 32));
 }
 
 /*
  * Write the 64-bit `val' at `offset': the low 32 bits first, then the high
  * 32 bits at offset + 4.
  */
 static __inline void
 bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t handle,
     bus_size_t offset, uint64_t val)
 {
 
 	bus_space_write_4(tag, handle, offset, (uint32_t)val);
 	bus_space_write_4(tag, handle, offset + 4, (uint32_t)(val >> 32));
 }
 #endif
 
 /*
  * Register access helpers.  SIZE is the access width suffix pasted onto
  * bus_space_read_/bus_space_write_ (e.g. 4 or 8).  All of them implicitly
  * use a variable named `ntb' (struct ntb_softc *) from the calling scope.
  *
  *  intel_ntb_bar_*  - access an arbitrary BAR from bar_info[]
  *  intel_ntb_reg_*  - access the configuration BAR (NTB_CONFIG_BAR)
  *  intel_ntb_mw_*   - access the BAR behind memory window ntb->b2b_mw_idx
  */
 #define intel_ntb_bar_read(SIZE, bar, offset) \
 	    bus_space_read_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
 	    ntb->bar_info[(bar)].pci_bus_handle, (offset))
 #define intel_ntb_bar_write(SIZE, bar, offset, val) \
 	    bus_space_write_ ## SIZE (ntb->bar_info[(bar)].pci_bus_tag, \
 	    ntb->bar_info[(bar)].pci_bus_handle, (offset), (val))
 #define intel_ntb_reg_read(SIZE, offset) \
 	    intel_ntb_bar_read(SIZE, NTB_CONFIG_BAR, offset)
 #define intel_ntb_reg_write(SIZE, offset, val) \
 	    intel_ntb_bar_write(SIZE, NTB_CONFIG_BAR, offset, val)
 #define intel_ntb_mw_read(SIZE, offset) \
 	    intel_ntb_bar_read(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
 		offset)
 #define intel_ntb_mw_write(SIZE, offset, val) \
 	    intel_ntb_bar_write(SIZE, intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx), \
 		offset, val)
 
 static int intel_ntb_probe(device_t device);
 static int intel_ntb_attach(device_t device);
 static int intel_ntb_detach(device_t device);
 static uint64_t intel_ntb_db_valid_mask(device_t dev);
 static void intel_ntb_spad_clear(device_t dev);
 static uint64_t intel_ntb_db_vector_mask(device_t dev, uint32_t vector);
 static bool intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed,
     enum ntb_width *width);
 static int intel_ntb_link_enable(device_t dev, enum ntb_speed speed,
     enum ntb_width width);
 static int intel_ntb_link_disable(device_t dev);
 static int intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val);
 static int intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val);
 
 static unsigned intel_ntb_user_mw_to_idx(struct ntb_softc *, unsigned uidx);
 static inline enum ntb_bar intel_ntb_mw_to_bar(struct ntb_softc *, unsigned mw);
 static inline bool bar_is_64bit(struct ntb_softc *, enum ntb_bar);
 static inline void bar_get_xlat_params(struct ntb_softc *, enum ntb_bar,
     uint32_t *base, uint32_t *xlat, uint32_t *lmt);
 static int intel_ntb_map_pci_bars(struct ntb_softc *ntb);
 static int intel_ntb_mw_set_wc_internal(struct ntb_softc *, unsigned idx,
     vm_memattr_t);
 static void print_map_success(struct ntb_softc *, struct ntb_pci_bar_info *,
     const char *);
 static int map_mmr_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar);
 static int map_memory_window_bar(struct ntb_softc *ntb,
     struct ntb_pci_bar_info *bar);
 static void intel_ntb_unmap_pci_bar(struct ntb_softc *ntb);
 static int intel_ntb_remap_msix(device_t, uint32_t desired, uint32_t avail);
 static int intel_ntb_init_isr(struct ntb_softc *ntb);
 static int intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb);
 static int intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors);
 static void intel_ntb_teardown_interrupts(struct ntb_softc *ntb);
 static inline uint64_t intel_ntb_vec_mask(struct ntb_softc *, uint64_t db_vector);
 static void intel_ntb_interrupt(struct ntb_softc *, uint32_t vec);
 static void ndev_vec_isr(void *arg);
 static void ndev_irq_isr(void *arg);
 static inline uint64_t db_ioread(struct ntb_softc *, uint64_t regoff);
 static inline void db_iowrite(struct ntb_softc *, uint64_t regoff, uint64_t);
 static inline void db_iowrite_raw(struct ntb_softc *, uint64_t regoff, uint64_t);
 static int intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors);
 static void intel_ntb_free_msix_vec(struct ntb_softc *ntb);
 static void intel_ntb_get_msix_info(struct ntb_softc *ntb);
 static void intel_ntb_exchange_msix(void *);
 static struct ntb_hw_info *intel_ntb_get_device_info(uint32_t device_id);
 static void intel_ntb_detect_max_mw(struct ntb_softc *ntb);
 static int intel_ntb_detect_xeon(struct ntb_softc *ntb);
 static int intel_ntb_detect_atom(struct ntb_softc *ntb);
 static int intel_ntb_xeon_init_dev(struct ntb_softc *ntb);
 static int intel_ntb_atom_init_dev(struct ntb_softc *ntb);
 static void intel_ntb_teardown_xeon(struct ntb_softc *ntb);
 static void configure_atom_secondary_side_bars(struct ntb_softc *ntb);
 static void xeon_reset_sbar_size(struct ntb_softc *, enum ntb_bar idx,
     enum ntb_bar regbar);
 static void xeon_set_sbar_base_and_limit(struct ntb_softc *,
     uint64_t base_addr, enum ntb_bar idx, enum ntb_bar regbar);
 static void xeon_set_pbar_xlat(struct ntb_softc *, uint64_t base_addr,
     enum ntb_bar idx);
 static int xeon_setup_b2b_mw(struct ntb_softc *,
     const struct ntb_b2b_addr *addr, const struct ntb_b2b_addr *peer_addr);
 static inline bool link_is_up(struct ntb_softc *ntb);
 static inline bool _xeon_link_is_up(struct ntb_softc *ntb);
 static inline bool atom_link_is_err(struct ntb_softc *ntb);
 static inline enum ntb_speed intel_ntb_link_sta_speed(struct ntb_softc *);
 static inline enum ntb_width intel_ntb_link_sta_width(struct ntb_softc *);
 static void atom_link_hb(void *arg);
 static void recover_atom_link(void *arg);
 static bool intel_ntb_poll_link(struct ntb_softc *ntb);
 static void save_bar_parameters(struct ntb_pci_bar_info *bar);
 static void intel_ntb_sysctl_init(struct ntb_softc *);
 static int sysctl_handle_features(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_link_status(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_register(SYSCTL_HANDLER_ARGS);
 
 /* Log verbosity, exposed as the hw.ntb.debug_level sysctl/tunable. */
 static unsigned g_ntb_hw_debug_level;
 SYSCTL_UINT(_hw_ntb, OID_AUTO, debug_level, CTLFLAG_RWTUN,
     &g_ntb_hw_debug_level, 0, "ntb_hw log level -- higher is more verbose");
 /*
  * device_printf() on ntb->device, emitted only when `lvl' does not exceed
  * the configured debug level.  Requires a variable named `ntb' in scope.
  */
 #define intel_ntb_printf(lvl, ...) do {				\
 	if ((lvl) <= g_ntb_hw_debug_level) {			\
 		device_printf(ntb->device, __VA_ARGS__);	\
 	}							\
 } while (0)
 
 /*
  * Values accepted by the hw.ntb.default_mw_pat tunable; translated to a
  * vm_memattr_t by intel_ntb_pat_flags().
  */
 #define	_NTB_PAT_UC	0
 #define	_NTB_PAT_WC	1
 #define	_NTB_PAT_WT	4
 #define	_NTB_PAT_WP	5
 #define	_NTB_PAT_WB	6
 #define	_NTB_PAT_UCM	7
 /* Defaults to uncacheable. */
 static unsigned g_ntb_mw_pat = _NTB_PAT_UC;
 SYSCTL_UINT(_hw_ntb, OID_AUTO, default_mw_pat, CTLFLAG_RDTUN,
     &g_ntb_mw_pat, 0, "Configure the default memory window cache flags (PAT): "
     "UC: "  __XSTRING(_NTB_PAT_UC) ", "
     "WC: "  __XSTRING(_NTB_PAT_WC) ", "
     "WT: "  __XSTRING(_NTB_PAT_WT) ", "
     "WP: "  __XSTRING(_NTB_PAT_WP) ", "
     "WB: "  __XSTRING(_NTB_PAT_WB) ", "
     "UC-: " __XSTRING(_NTB_PAT_UCM));
 
 /*
  * Translate the hw.ntb.default_mw_pat setting into a VM memory attribute.
  * _NTB_PAT_UC and every unrecognized value yield VM_MEMATTR_UNCACHEABLE.
  */
 static inline vm_memattr_t
 intel_ntb_pat_flags(void)
 {
 	unsigned pat = g_ntb_mw_pat;
 
 	if (pat == _NTB_PAT_WC)
 		return (VM_MEMATTR_WRITE_COMBINING);
 	if (pat == _NTB_PAT_WT)
 		return (VM_MEMATTR_WRITE_THROUGH);
 	if (pat == _NTB_PAT_WP)
 		return (VM_MEMATTR_WRITE_PROTECTED);
 	if (pat == _NTB_PAT_WB)
 		return (VM_MEMATTR_WRITE_BACK);
 	if (pat == _NTB_PAT_UCM)
 		return (VM_MEMATTR_WEAK_UNCACHEABLE);
 
 	/* _NTB_PAT_UC, and anything we do not know about. */
 	return (VM_MEMATTR_UNCACHEABLE);
 }
 
 /*
  * Well, this obviously doesn't belong here, but it doesn't seem to exist
  * anywhere better yet.
  */
 /*
  * Return a static, human-readable name for a VM memory attribute, or
  * "UNKNOWN" for values not listed here.
  */
 static inline const char *
 intel_ntb_vm_memattr_to_str(vm_memattr_t pat)
 {
 	const char *name;
 
 	if (pat == VM_MEMATTR_WRITE_COMBINING)
 		name = "WRITE_COMBINING";
 	else if (pat == VM_MEMATTR_WRITE_THROUGH)
 		name = "WRITE_THROUGH";
 	else if (pat == VM_MEMATTR_WRITE_PROTECTED)
 		name = "WRITE_PROTECTED";
 	else if (pat == VM_MEMATTR_WRITE_BACK)
 		name = "WRITE_BACK";
 	else if (pat == VM_MEMATTR_WEAK_UNCACHEABLE)
 		name = "UNCACHED";
 	else if (pat == VM_MEMATTR_UNCACHEABLE)
 		name = "UNCACHEABLE";
 	else
 		name = "UNKNOWN";
 
 	return (name);
 }
 
 /*
  * hw.ntb.msix_mw_idx: memory window used to reach the peer's MSI-X
  * registers (default 1).  Declared CTLFLAG_RDTUN.
  */
 static int g_ntb_msix_idx = 1;
 SYSCTL_INT(_hw_ntb, OID_AUTO, msix_mw_idx, CTLFLAG_RDTUN, &g_ntb_msix_idx,
     0, "Use this memory window to access the peer MSIX message complex on "
     "certain Xeon-based NTB systems, as a workaround for a hardware errata.  "
     "Like b2b_mw_idx, negative values index from the last available memory "
     "window.  (Applies on Xeon platforms with SB01BASE_LOCKUP errata.)");
 
 /*
  * hw.ntb.b2b_mw_idx: memory window used to reach the peer's NTB registers
  * (default -1, i.e. the last window).  Declared CTLFLAG_RDTUN.
  */
 static int g_ntb_mw_idx = -1;
 SYSCTL_INT(_hw_ntb, OID_AUTO, b2b_mw_idx, CTLFLAG_RDTUN, &g_ntb_mw_idx,
     0, "Use this memory window to access the peer NTB registers.  A "
     "non-negative value starts from the first MW index; a negative value "
     "starts from the last MW index.  The default is -1, i.e., the last "
     "available memory window.  Both sides of the NTB MUST set the same "
     "value here!  (Applies on Xeon platforms with SDOORBELL_LOCKUP errata.)");
 
 /*
  * Bits of ntb_softc.features / ntb_hw_info.features; test with
  * HAS_FEATURE().
  */
 /* Hardware owns the low 16 bits of features. */
 #define NTB_BAR_SIZE_4K		(1 << 0)
 #define NTB_SDOORBELL_LOCKUP	(1 << 1)
 #define NTB_SB01BASE_LOCKUP	(1 << 2)
 #define NTB_B2BDOORBELL_BIT14	(1 << 3)
 /* Software/configuration owns the top 16 bits. */
 #define NTB_SPLIT_BAR		(1ull << 16)
 
 /*
  * NOTE(review): looks like a kernel printf(9) "%b" bit-name string (hex
  * base, then 1-based bit numbers followed by names) -- confirm at the user.
  */
 #define NTB_FEATURES_STR \
     "\20\21SPLIT_BAR4\04B2B_DOORBELL_BIT14\03SB01BASE_LOCKUP" \
     "\02SDOORBELL_LOCKUP\01BAR_SIZE_4K"
 
 /*
  * Supported devices with their family and erratum/feature bits.  Looked up
  * by PCI device id through intel_ntb_get_device_info().  The final entry
  * (id 0, NULL description) ends the table.
  */
 static struct ntb_hw_info pci_ids[] = {
 	/* XXX: PS/SS IDs left out until they are supported. */
 	{ 0x0C4E8086, "BWD Atom Processor S1200 Non-Transparent Bridge B2B",
 		NTB_ATOM, 0 },
 
 	{ 0x37258086, "JSF Xeon C35xx/C55xx Non-Transparent Bridge B2B",
 		NTB_XEON, NTB_SDOORBELL_LOCKUP | NTB_B2BDOORBELL_BIT14 },
 	{ 0x3C0D8086, "SNB Xeon E5/Core i7 Non-Transparent Bridge B2B",
 		NTB_XEON, NTB_SDOORBELL_LOCKUP | NTB_B2BDOORBELL_BIT14 },
 	{ 0x0E0D8086, "IVT Xeon E5 V2 Non-Transparent Bridge B2B", NTB_XEON,
 		NTB_SDOORBELL_LOCKUP | NTB_B2BDOORBELL_BIT14 |
 		    NTB_SB01BASE_LOCKUP | NTB_BAR_SIZE_4K },
 	{ 0x2F0D8086, "HSX Xeon E5 V3 Non-Transparent Bridge B2B", NTB_XEON,
 		NTB_SDOORBELL_LOCKUP | NTB_B2BDOORBELL_BIT14 |
 		    NTB_SB01BASE_LOCKUP },
 	{ 0x6F0D8086, "BDX Xeon E5 V4 Non-Transparent Bridge B2B", NTB_XEON,
 		NTB_SDOORBELL_LOCKUP | NTB_B2BDOORBELL_BIT14 |
 		    NTB_SB01BASE_LOCKUP },
 
 	{ 0x00000000, NULL, NTB_ATOM, 0 }
 };
 
 /*
  * Atom register layout: 64-bit doorbell registers and two memory-window
  * BARs.
  */
 static const struct ntb_reg atom_reg = {
 	.ntb_ctl = ATOM_NTBCNTL_OFFSET,
 	.lnk_sta = ATOM_LINK_STATUS_OFFSET,
 	.db_size = sizeof(uint64_t),
 	.mw_bar = { NTB_B2B_BAR_1, NTB_B2B_BAR_2 },
 };
 
 /* Atom primary-side doorbell, doorbell mask and scratchpad offsets. */
 static const struct ntb_alt_reg atom_pri_reg = {
 	.db_bell = ATOM_PDOORBELL_OFFSET,
 	.db_mask = ATOM_PDBMSK_OFFSET,
 	.spad = ATOM_SPAD_OFFSET,
 };
 
 /* Atom B2B doorbell and scratchpad offsets; db_mask is left zero. */
 static const struct ntb_alt_reg atom_b2b_reg = {
 	.db_bell = ATOM_B2B_DOORBELL_OFFSET,
 	.spad = ATOM_B2B_SPAD_OFFSET,
 };
 
 /*
  * Atom secondary-side translation registers.  Only the xlat offsets are
  * populated; the base and limit offsets are compiled out below.
  */
 static const struct ntb_xlat_reg atom_sec_xlat = {
 #if 0
 	/* "FIXME" says the Linux driver. */
 	.bar0_base = ATOM_SBAR0BASE_OFFSET,
 	.bar2_base = ATOM_SBAR2BASE_OFFSET,
 	.bar4_base = ATOM_SBAR4BASE_OFFSET,
 
 	.bar2_limit = ATOM_SBAR2LMT_OFFSET,
 	.bar4_limit = ATOM_SBAR4LMT_OFFSET,
 #endif
 
 	.bar2_xlat = ATOM_SBAR2XLAT_OFFSET,
 	.bar4_xlat = ATOM_SBAR4XLAT_OFFSET,
 };
 
 /*
  * Xeon register layout: 16-bit doorbell registers and up to three
  * memory-window BARs.
  */
 static const struct ntb_reg xeon_reg = {
 	.ntb_ctl = XEON_NTBCNTL_OFFSET,
 	.lnk_sta = XEON_LINK_STATUS_OFFSET,
 	.db_size = sizeof(uint16_t),
 	.mw_bar = { NTB_B2B_BAR_1, NTB_B2B_BAR_2, NTB_B2B_BAR_3 },
 };
 
 /* Xeon primary-side doorbell, doorbell mask and scratchpad offsets. */
 static const struct ntb_alt_reg xeon_pri_reg = {
 	.db_bell = XEON_PDOORBELL_OFFSET,
 	.db_mask = XEON_PDBMSK_OFFSET,
 	.spad = XEON_SPAD_OFFSET,
 };
 
 /* Xeon B2B doorbell and scratchpad offsets; db_mask is left zero. */
 static const struct ntb_alt_reg xeon_b2b_reg = {
 	.db_bell = XEON_B2B_DOORBELL_OFFSET,
 	.spad = XEON_B2B_SPAD_OFFSET,
 };
 
 /* Xeon secondary-side BAR base, limit and translation register offsets. */
 static const struct ntb_xlat_reg xeon_sec_xlat = {
 	.bar0_base = XEON_SBAR0BASE_OFFSET,
 	.bar2_base = XEON_SBAR2BASE_OFFSET,
 	.bar4_base = XEON_SBAR4BASE_OFFSET,
 	.bar5_base = XEON_SBAR5BASE_OFFSET,
 
 	.bar2_limit = XEON_SBAR2LMT_OFFSET,
 	.bar4_limit = XEON_SBAR4LMT_OFFSET,
 	.bar5_limit = XEON_SBAR5LMT_OFFSET,
 
 	.bar2_xlat = XEON_SBAR2XLAT_OFFSET,
 	.bar4_xlat = XEON_SBAR4XLAT_OFFSET,
 	.bar5_xlat = XEON_SBAR5XLAT_OFFSET,
 };
 
 /*
  * Default B2B bus addresses for the upstream (usd) side.  Not const: the
  * hw.ntb.xeon_b2b.usd_* tunables below point into this structure.
  */
 static struct ntb_b2b_addr xeon_b2b_usd_addr = {
 	.bar0_addr = XEON_B2B_BAR0_ADDR,
 	.bar2_addr64 = XEON_B2B_BAR2_ADDR64,
 	.bar4_addr64 = XEON_B2B_BAR4_ADDR64,
 	.bar4_addr32 = XEON_B2B_BAR4_ADDR32,
 	.bar5_addr32 = XEON_B2B_BAR5_ADDR32,
 };
 
 /*
  * Default B2B bus addresses for the downstream (dsd) side; the defaults are
  * the same constants as the upstream table.  Overridable through the
  * hw.ntb.xeon_b2b.dsd_* tunables.
  */
 static struct ntb_b2b_addr xeon_b2b_dsd_addr = {
 	.bar0_addr = XEON_B2B_BAR0_ADDR,
 	.bar2_addr64 = XEON_B2B_BAR2_ADDR64,
 	.bar4_addr64 = XEON_B2B_BAR4_ADDR64,
 	.bar4_addr32 = XEON_B2B_BAR4_ADDR32,
 	.bar5_addr32 = XEON_B2B_BAR5_ADDR32,
 };
 
 /*
  * hw.ntb.xeon_b2b.*: CTLFLAG_RDTUN knobs exposing the fields of
  * xeon_b2b_usd_addr and xeon_b2b_dsd_addr.  bar0_addr has no knob.
  */
 SYSCTL_NODE(_hw_ntb, OID_AUTO, xeon_b2b, CTLFLAG_RW, 0,
     "B2B MW segment overrides -- MUST be the same on both sides");
 
 /* Upstream-side addresses. */
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, usd_bar2_addr64, CTLFLAG_RDTUN,
     &xeon_b2b_usd_addr.bar2_addr64, 0, "If using B2B topology on Xeon "
     "hardware, use this 64-bit address on the bus between the NTB devices for "
     "the window at BAR2, on the upstream side of the link.  MUST be the same "
     "address on both sides.");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, usd_bar4_addr64, CTLFLAG_RDTUN,
     &xeon_b2b_usd_addr.bar4_addr64, 0, "See usd_bar2_addr64, but BAR4.");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, usd_bar4_addr32, CTLFLAG_RDTUN,
     &xeon_b2b_usd_addr.bar4_addr32, 0, "See usd_bar2_addr64, but BAR4 "
     "(split-BAR mode).");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, usd_bar5_addr32, CTLFLAG_RDTUN,
     &xeon_b2b_usd_addr.bar5_addr32, 0, "See usd_bar2_addr64, but BAR5 "
     "(split-BAR mode).");
 
 /* Downstream-side addresses. */
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar2_addr64, CTLFLAG_RDTUN,
     &xeon_b2b_dsd_addr.bar2_addr64, 0, "If using B2B topology on Xeon "
     "hardware, use this 64-bit address on the bus between the NTB devices for "
     "the window at BAR2, on the downstream side of the link.  MUST be the same"
     " address on both sides.");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar4_addr64, CTLFLAG_RDTUN,
     &xeon_b2b_dsd_addr.bar4_addr64, 0, "See dsd_bar2_addr64, but BAR4.");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar4_addr32, CTLFLAG_RDTUN,
     &xeon_b2b_dsd_addr.bar4_addr32, 0, "See dsd_bar2_addr64, but BAR4 "
     "(split-BAR mode).");
 SYSCTL_UQUAD(_hw_ntb_xeon_b2b, OID_AUTO, dsd_bar5_addr32, CTLFLAG_RDTUN,
     &xeon_b2b_dsd_addr.bar5_addr32, 0, "See dsd_bar2_addr64, but BAR5 "
     "(split-BAR mode).");
 
 /*
  * OS <-> Driver interface structures
  *
  * M_NTB is the malloc(9) type under which this driver's allocations are
  * accounted.
  */
 MALLOC_DEFINE(M_NTB, "ntb_hw", "ntb_hw driver memory allocations");
 
 /*
  * OS <-> Driver linkage functions
  */
 /*
  * Device probe method: claim the device (return 0) and set its description
  * when its PCI device id is known to intel_ntb_get_device_info(); otherwise
  * return ENXIO.
  */
 static int
 intel_ntb_probe(device_t device)
 {
 	struct ntb_hw_info *info;
 
 	info = intel_ntb_get_device_info(pci_get_devid(device));
 	if (info != NULL) {
 		device_set_desc(device, info->desc);
 		return (0);
 	}
 	return (ENXIO);
 }
 
 /*
  * Device attach method.
  *
  * Initializes the softc, detects the hardware configuration, maps the PCI
  * BARs, runs the family-specific device init, and finally registers the
  * device so that child drivers can attach.  On any failure
  * intel_ntb_detach() is called to unwind, and the error is returned.
  */
 static int
 intel_ntb_attach(device_t device)
 {
 	struct ntb_softc *ntb;
 	struct ntb_hw_info *p;
 	int error;
 
 	ntb = device_get_softc(device);
 	/*
 	 * NOTE(review): p is not checked for NULL; this relies on probe
 	 * having matched the same device id.
 	 */
 	p = intel_ntb_get_device_info(pci_get_devid(device));
 
 	ntb->device = device;
 	ntb->type = p->type;
 	ntb->features = p->features;
 	/* No memory window is reserved for B2B/MSI-X access yet. */
 	ntb->b2b_mw_idx = B2B_MW_DISABLED;
 	ntb->msix_mw_idx = B2B_MW_DISABLED;
 
 	/* Heartbeat timer for NTB_ATOM since there is no link interrupt */
 	callout_init(&ntb->heartbeat_timer, 1);
 	callout_init(&ntb->lr_timer, 1);
 	callout_init(&ntb->peer_msix_work, 1);
 	mtx_init(&ntb->db_mask_lock, "ntb hw bits", NULL, MTX_SPIN);
 
 	if (ntb->type == NTB_ATOM)
 		error = intel_ntb_detect_atom(ntb);
 	else
 		error = intel_ntb_detect_xeon(ntb);
 	if (error != 0)
 		goto out;
 
 	intel_ntb_detect_max_mw(ntb);
 
 	pci_enable_busmaster(ntb->device);
 
 	error = intel_ntb_map_pci_bars(ntb);
 	if (error != 0)
 		goto out;
 	if (ntb->type == NTB_ATOM)
 		error = intel_ntb_atom_init_dev(ntb);
 	else
 		error = intel_ntb_xeon_init_dev(ntb);
 	if (error != 0)
 		goto out;
 
 	intel_ntb_spad_clear(device);
 
 	intel_ntb_poll_link(ntb);
 
 	intel_ntb_sysctl_init(ntb);
 
 	/* Attach children to this controller */
 	error = ntb_register_device(device);
 
 out:
 	/* Reached on success too; only unwind when something failed. */
 	if (error != 0)
 		intel_ntb_detach(device);
 	return (error);
 }
 
 /*
  * Device detach method; also called by intel_ntb_attach() to unwind a failed
  * attach.  Always returns 0.
  */
 static int
 intel_ntb_detach(device_t device)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(device);
 
 	/* Detach & delete all children */
 	ntb_unregister_device(device);
 
 	/*
 	 * Write db_valid_mask into the local doorbell mask register.
 	 * NOTE(review): presumably this masks every doorbell, and self_reg is
 	 * still NULL when attach failed before device init -- confirm.
 	 */
 	if (ntb->self_reg != NULL) {
 		DB_MASK_LOCK(ntb);
 		db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_valid_mask);
 		DB_MASK_UNLOCK(ntb);
 	}
 	/* Wait for the timers to finish before tearing hardware state down. */
 	callout_drain(&ntb->heartbeat_timer);
 	callout_drain(&ntb->lr_timer);
 	callout_drain(&ntb->peer_msix_work);
 	pci_disable_busmaster(ntb->device);
 	if (ntb->type == NTB_XEON)
 		intel_ntb_teardown_xeon(ntb);
 	intel_ntb_teardown_interrupts(ntb);
 
 	mtx_destroy(&ntb->db_mask_lock);
 
 	intel_ntb_unmap_pci_bar(ntb);
 
 	return (0);
 }
 
 /*
  * Driver internal routines
  */
 /*
  * Translate a memory-window index into the BAR that backs it, using the
  * per-family mw_bar[] table.  KASSERT-checked: `mw' must be below
  * ntb->mw_count and must not resolve to table value 0 (NTB_CONFIG_BAR).
  */
 static inline enum ntb_bar
 intel_ntb_mw_to_bar(struct ntb_softc *ntb, unsigned mw)
 {
 
 	/* The check also fails when mw == mw_count, hence ">=" in the text. */
 	KASSERT(mw < ntb->mw_count,
 	    ("%s: mw:%u >= count:%u", __func__, mw, (unsigned)ntb->mw_count));
 	KASSERT(ntb->reg->mw_bar[mw] != 0, ("invalid mw"));
 
 	return (ntb->reg->mw_bar[mw]);
 }
 
 static inline bool
 bar_is_64bit(struct ntb_softc *ntb, enum ntb_bar bar)
 {
 	/* XXX This assertion could be stronger. */
 	KASSERT(bar < NTB_MAX_BARS, ("bogus bar"));
 	return (bar < NTB_B2B_BAR_2 || !HAS_FEATURE(ntb, NTB_SPLIT_BAR));
 }
 
 /*
  * Look up the base, translation and limit register offsets of a B2B BAR in
  * ntb->xlat_reg.  Each of `base', `xlat' and `lmt' may be NULL when the
  * caller does not need that value.  A BAR outside NTB_B2B_BAR_1..3 trips the
  * assertion and yields zeros.
  */
 static inline void
 bar_get_xlat_params(struct ntb_softc *ntb, enum ntb_bar bar, uint32_t *base,
     uint32_t *xlat, uint32_t *lmt)
 {
 	const struct ntb_xlat_reg *regs;
 	uint32_t base_off = 0, limit_off = 0, xlat_off = 0;
 
 	KASSERT(bar >= NTB_B2B_BAR_1 && bar < NTB_MAX_BARS,
 	    ("bad bar"));
 
 	regs = ntb->xlat_reg;
 	if (bar == NTB_B2B_BAR_1) {
 		base_off = regs->bar2_base;
 		limit_off = regs->bar2_limit;
 		xlat_off = regs->bar2_xlat;
 	} else if (bar == NTB_B2B_BAR_2) {
 		base_off = regs->bar4_base;
 		limit_off = regs->bar4_limit;
 		xlat_off = regs->bar4_xlat;
 	} else if (bar == NTB_B2B_BAR_3) {
 		base_off = regs->bar5_base;
 		limit_off = regs->bar5_limit;
 		xlat_off = regs->bar5_xlat;
 	}
 
 	if (base != NULL)
 		*base = base_off;
 	if (xlat != NULL)
 		*xlat = xlat_off;
 	if (lmt != NULL)
 		*lmt = limit_off;
 }
 
 static int
 intel_ntb_map_pci_bars(struct ntb_softc *ntb)
 {
 	int rc;
 
 	ntb->bar_info[NTB_CONFIG_BAR].pci_resource_id = PCIR_BAR(0);
 	rc = map_mmr_bar(ntb, &ntb->bar_info[NTB_CONFIG_BAR]);
 	if (rc != 0)
 		goto out;
 
 	ntb->bar_info[NTB_B2B_BAR_1].pci_resource_id = PCIR_BAR(2);
 	rc = map_memory_window_bar(ntb, &ntb->bar_info[NTB_B2B_BAR_1]);
 	if (rc != 0)
 		goto out;
 	ntb->bar_info[NTB_B2B_BAR_1].psz_off = XEON_PBAR23SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_1].ssz_off = XEON_SBAR23SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_1].pbarxlat_off = XEON_PBAR2XLAT_OFFSET;
 
 	ntb->bar_info[NTB_B2B_BAR_2].pci_resource_id = PCIR_BAR(4);
 	rc = map_memory_window_bar(ntb, &ntb->bar_info[NTB_B2B_BAR_2]);
 	if (rc != 0)
 		goto out;
 	ntb->bar_info[NTB_B2B_BAR_2].psz_off = XEON_PBAR4SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_2].ssz_off = XEON_SBAR4SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off = XEON_PBAR4XLAT_OFFSET;
 
 	if (!HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		goto out;
 
 	ntb->bar_info[NTB_B2B_BAR_3].pci_resource_id = PCIR_BAR(5);
 	rc = map_memory_window_bar(ntb, &ntb->bar_info[NTB_B2B_BAR_3]);
 	ntb->bar_info[NTB_B2B_BAR_3].psz_off = XEON_PBAR5SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_3].ssz_off = XEON_SBAR5SZ_OFFSET;
 	ntb->bar_info[NTB_B2B_BAR_3].pbarxlat_off = XEON_PBAR5XLAT_OFFSET;
 
 out:
 	if (rc != 0)
 		device_printf(ntb->device,
 		    "unable to allocate pci resource\n");
 	return (rc);
 }
 
 /*
  * Log the virtual and physical address ranges and the size of a freshly
  * mapped BAR.  `kind' is a short caller-supplied label ("mmr" or "mw" in
  * this file).
  */
 static void
 print_map_success(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar,
     const char *kind)
 {
 	char *virt_last;
 	void *phys_first, *phys_last;
 
 	/* Both ranges are printed inclusive, hence the "- 1". */
 	virt_last = (char *)bar->vbase + bar->size - 1;
 	phys_first = (void *)bar->pbase;
 	phys_last = (void *)(bar->pbase + bar->size - 1);
 
 	device_printf(ntb->device,
 	    "Mapped BAR%d v:[%p-%p] p:[%p-%p] (0x%jx bytes) (%s)\n",
 	    PCI_RID2BAR(bar->pci_resource_id), bar->vbase, virt_last,
 	    phys_first, phys_last, (uintmax_t)bar->size, kind);
 }
 
 /*
  * Allocate and activate the memory resource behind the register (MMR) BAR
  * described by `bar', record its parameters and mark it uncacheable.
  * Returns ENXIO when the resource cannot be allocated, 0 otherwise.
  */
 static int
 map_mmr_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
 {
 	struct resource *res;
 
 	res = bus_alloc_resource_any(ntb->device, SYS_RES_MEMORY,
 	    &bar->pci_resource_id, RF_ACTIVE);
 	bar->pci_resource = res;
 	if (res == NULL)
 		return (ENXIO);
 
 	save_bar_parameters(bar);
 	bar->map_mode = VM_MEMATTR_UNCACHEABLE;
 	print_map_success(ntb, bar, "mmr");
 	return (0);
 }
 
 /*
  * Allocate and activate the memory resource behind a memory-window BAR,
  * optionally resize it (NTB_BAR_SIZE_4K hardware), and then try to switch
  * its mapping to the memory attribute returned by intel_ntb_pat_flags().
  *
  * Returns 0 on success (including when the attribute change fails, which
  * is only logged), ENXIO if the resource cannot be allocated, or the error
  * from bus_adjust_resource() if the resize fails.
  */
 static int
 map_memory_window_bar(struct ntb_softc *ntb, struct ntb_pci_bar_info *bar)
 {
 	int rc;
 	vm_memattr_t mapmode;
 	uint8_t bar_size_bits = 0;
 
 	bar->pci_resource = bus_alloc_resource_any(ntb->device, SYS_RES_MEMORY,
 	    &bar->pci_resource_id, RF_ACTIVE);
 
 	if (bar->pci_resource == NULL)
 		return (ENXIO);
 
 	/* NOTE(review): presumably caches vbase/pbase/size from the resource. */
 	save_bar_parameters(bar);
 	/*
 	 * Ivytown NTB BAR sizes are misreported by the hardware due to a
 	 * hardware issue. To work around this, query the size it should be
 	 * configured to by the device and modify the resource to correspond to
 	 * this new size. The BIOS on systems with this problem is required to
 	 * provide enough address space to allow the driver to make this change
 	 * safely.
 	 *
 	 * Ideally I could have just specified the size when I allocated the
 	 * resource like:
 	 *  bus_alloc_resource(ntb->device,
 	 *	SYS_RES_MEMORY, &bar->pci_resource_id, 0ul, ~0ul,
 	 *	1ul << bar_size_bits, RF_ACTIVE);
 	 * but the PCI driver does not honor the size in this call, so we have
 	 * to modify it after the fact.
 	 */
 	if (HAS_FEATURE(ntb, NTB_BAR_SIZE_4K)) {
 		/* The size is read as a power-of-two exponent (one byte). */
 		if (bar->pci_resource_id == PCIR_BAR(2))
 			bar_size_bits = pci_read_config(ntb->device,
 			    XEON_PBAR23SZ_OFFSET, 1);
 		else
 			bar_size_bits = pci_read_config(ntb->device,
 			    XEON_PBAR45SZ_OFFSET, 1);
 
 		/* Keep the base, extend the end to base + 2^bits - 1. */
 		rc = bus_adjust_resource(ntb->device, SYS_RES_MEMORY,
 		    bar->pci_resource, bar->pbase,
 		    bar->pbase + (1ul << bar_size_bits) - 1);
 		if (rc != 0) {
 			device_printf(ntb->device,
 			    "unable to resize bar\n");
 			return (rc);
 		}
 
 		/* Refresh the cached parameters after the resize. */
 		save_bar_parameters(bar);
 	}
 
 	bar->map_mode = VM_MEMATTR_UNCACHEABLE;
 	print_map_success(ntb, bar, "mw");
 
 	/*
 	 * Optionally, mark MW BARs as anything other than UC to improve
 	 * performance.
 	 */
 	mapmode = intel_ntb_pat_flags();
 	if (mapmode == bar->map_mode)
 		return (0);
 
 	/* map_mode is only updated when the attribute change succeeds. */
 	rc = pmap_change_attr((vm_offset_t)bar->vbase, bar->size, mapmode);
 	if (rc == 0) {
 		bar->map_mode = mapmode;
 		device_printf(ntb->device,
 		    "Marked BAR%d v:[%p-%p] p:[%p-%p] as "
 		    "%s.\n",
 		    PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
 		    (char *)bar->vbase + bar->size - 1,
 		    (void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
 		    intel_ntb_vm_memattr_to_str(mapmode));
 	} else
 		device_printf(ntb->device,
 		    "Unable to mark BAR%d v:[%p-%p] p:[%p-%p] as "
 		    "%s: %d\n",
 		    PCI_RID2BAR(bar->pci_resource_id), bar->vbase,
 		    (char *)bar->vbase + bar->size - 1,
 		    (void *)bar->pbase, (void *)(bar->pbase + bar->size - 1),
 		    intel_ntb_vm_memattr_to_str(mapmode), rc);
 		/* Proceed anyway */
 	return (0);
 }
 
 /*
  * Release every BAR memory resource that is currently recorded in the
  * softc.  Each slot is cleared after release so that a repeated call (or a
  * call after a partially failed attach) never releases the same resource
  * twice.
  */
 static void
 intel_ntb_unmap_pci_bar(struct ntb_softc *ntb)
 {
 	struct ntb_pci_bar_info *current_bar;
 	int i;
 
 	for (i = 0; i < NTB_MAX_BARS; i++) {
 		current_bar = &ntb->bar_info[i];
 		if (current_bar->pci_resource == NULL)
 			continue;
 
 		bus_release_resource(ntb->device, SYS_RES_MEMORY,
 		    current_bar->pci_resource_id,
 		    current_bar->pci_resource);
 		/* Do not leave a dangling pointer behind. */
 		current_bar->pci_resource = NULL;
 	}
 }
 
 static int
 intel_ntb_setup_msix(struct ntb_softc *ntb, uint32_t num_vectors)
 {
 	uint32_t i;
 	int rc;
 
 	for (i = 0; i < num_vectors; i++) {
 		ntb->int_info[i].rid = i + 1;
 		ntb->int_info[i].res = bus_alloc_resource_any(ntb->device,
 		    SYS_RES_IRQ, &ntb->int_info[i].rid, RF_ACTIVE);
 		if (ntb->int_info[i].res == NULL) {
 			device_printf(ntb->device,
 			    "bus_alloc_resource failed\n");
 			return (ENOMEM);
 		}
 		ntb->int_info[i].tag = NULL;
 		ntb->allocated_interrupts++;
 		rc = bus_setup_intr(ntb->device, ntb->int_info[i].res,
 		    INTR_MPSAFE | INTR_TYPE_MISC, NULL, ndev_vec_isr,
 		    &ntb->msix_vec[i], &ntb->int_info[i].tag);
 		if (rc != 0) {
 			device_printf(ntb->device, "bus_setup_intr failed\n");
 			return (ENXIO);
 		}
 	}
 	return (0);
 }
 
 /*
  * The Linux NTB driver drops from MSI-X to legacy INTx if a unique vector
  * cannot be allocated for each MSI-X message.  JHB seems to think remapping
  * should be okay.  This tunable should enable us to test that hypothesis
  * when someone gets their hands on some Xeon hardware.
  *
  * When non-zero, intel_ntb_init_isr() behaves as if one fewer vector had
  * been granted than requested, which forces the remap path to be taken.
  */
 static int ntb_force_remap_mode;
 SYSCTL_INT(_hw_ntb, OID_AUTO, force_remap_mode, CTLFLAG_RDTUN,
     &ntb_force_remap_mode, 0, "If enabled, force MSI-X messages to be remapped"
     " to a smaller number of ithreads, even if the desired number are "
     "available");
 
 /*
  * In case it is NOT ok, give consumers an abort button.
  *
  * When non-zero, intel_ntb_remap_msix() refuses to remap and returns
  * ENXIO instead.
  */
 static int ntb_prefer_intx;
 SYSCTL_INT(_hw_ntb, OID_AUTO, prefer_intx_to_remap, CTLFLAG_RDTUN,
     &ntb_prefer_intx, 0, "If enabled, prefer to use legacy INTx mode rather "
     "than remapping MSI-X messages over available slots (match Linux driver "
     "behavior)");
 
 /*
  * Remap the desired number of MSI-X messages to available ithreads in a simple
  * round-robin fashion: message i is assigned vector (i % avail) + 1.
  *
  * dev     - device whose MSI-X table is remapped
  * desired - number of MSI-X messages to cover
  * avail   - number of vectors actually granted
  *
  * Returns 0 on success, ENXIO if the prefer_intx_to_remap tunable is set,
  * EINVAL if there is no vector to map onto (avail == 0), or the error from
  * pci_remap_msix().
  */
 static int
 intel_ntb_remap_msix(device_t dev, uint32_t desired, uint32_t avail)
 {
 	u_int *vectors;
 	uint32_t i;
 	int rc;
 
 	if (ntb_prefer_intx != 0)
 		return (ENXIO);
 
 	/*
 	 * The caller can end up here with zero usable vectors (e.g. forced
 	 * remap mode with a single granted vector).  Refuse instead of
 	 * dividing by zero below.
 	 */
 	if (avail == 0)
 		return (EINVAL);
 
 	vectors = malloc(desired * sizeof(*vectors), M_NTB, M_ZERO | M_WAITOK);
 
 	for (i = 0; i < desired; i++)
 		vectors[i] = (i % avail) + 1;
 
 	rc = pci_remap_msix(dev, desired, vectors);
 	free(vectors, M_NTB);
 	return (rc);
 }
 
 /*
  * Set up interrupt delivery for the device.
  *
  * Masks all doorbells, tries to allocate MSI-X vectors (remapping messages
  * onto fewer vectors when not enough are granted), and then installs either
  * per-vector handlers or a single legacy handler.
  *
  * Returns 0 on success, EINVAL when the SB01BASE_LOCKUP workaround cannot
  * be supported with the vectors obtained, or the error from the handler
  * setup routine.
  */
 static int
 intel_ntb_init_isr(struct ntb_softc *ntb)
 {
 	uint32_t desired_vectors, num_vectors;
 	int rc;
 
 	ntb->allocated_interrupts = 0;
 	ntb->last_ts = ticks;
 
 	/*
 	 * Mask all doorbell interrupts.  (Except link events!)
 	 */
 	DB_MASK_LOCK(ntb);
 	ntb->db_mask = ntb->db_valid_mask;
 	db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
 	DB_MASK_UNLOCK(ntb);
 
 	/* Ask for one vector per doorbell, capped by what the device has. */
 	num_vectors = desired_vectors = MIN(pci_msix_count(ntb->device),
 	    ntb->db_count);
 	if (desired_vectors >= 1) {
 		rc = pci_alloc_msix(ntb->device, &num_vectors);
 
 		/*
 		 * Testing aid: pretend one vector was not granted.  Note that
 		 * with desired_vectors == 1 this leaves num_vectors == 0,
 		 * which is then passed as "avail" to the remap helper.
 		 */
 		if (ntb_force_remap_mode != 0 && rc == 0 &&
 		    num_vectors == desired_vectors)
 			num_vectors--;
 
 		if (rc == 0 && num_vectors < desired_vectors) {
 			rc = intel_ntb_remap_msix(ntb->device, desired_vectors,
 			    num_vectors);
 			if (rc == 0)
 				num_vectors = desired_vectors;
 			else
 				pci_release_msi(ntb->device);
 		}
 		/* Allocation or remap failed: fall back to a single vector. */
 		if (rc != 0)
 			num_vectors = 1;
 	} else
 		num_vectors = 1;
 
 	if (ntb->type == NTB_XEON && num_vectors < ntb->db_vec_count) {
 		/* Too few vectors on Xeon: use one legacy interrupt. */
 		if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 			device_printf(ntb->device,
 			    "Errata workaround does not support MSI or INTX\n");
 			return (EINVAL);
 		}
 
 		ntb->db_vec_count = 1;
 		ntb->db_vec_shift = XEON_DB_TOTAL_SHIFT;
 		rc = intel_ntb_setup_legacy_interrupt(ntb);
 	} else {
 		/* The workaround needs exactly one vector beyond the non-link bits. */
 		if (num_vectors - 1 != XEON_NONLINK_DB_MSIX_BITS &&
 		    HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 			device_printf(ntb->device,
 			    "Errata workaround expects %d doorbell bits\n",
 			    XEON_NONLINK_DB_MSIX_BITS);
 			return (EINVAL);
 		}
 
 		intel_ntb_create_msix_vec(ntb, num_vectors);
 		rc = intel_ntb_setup_msix(ntb, num_vectors);
 	}
 	if (rc != 0) {
 		device_printf(ntb->device,
 		    "Error allocating interrupts: %d\n", rc);
 		/* IRQ resources already allocated are not released here. */
 		intel_ntb_free_msix_vec(ntb);
 	}
 
 	return (rc);
 }
 
 static int
 intel_ntb_setup_legacy_interrupt(struct ntb_softc *ntb)
 {
 	int rc;
 
 	ntb->int_info[0].rid = 0;
 	ntb->int_info[0].res = bus_alloc_resource_any(ntb->device, SYS_RES_IRQ,
 	    &ntb->int_info[0].rid, RF_SHAREABLE|RF_ACTIVE);
 	if (ntb->int_info[0].res == NULL) {
 		device_printf(ntb->device, "bus_alloc_resource failed\n");
 		return (ENOMEM);
 	}
 
 	ntb->int_info[0].tag = NULL;
 	ntb->allocated_interrupts = 1;
 
 	rc = bus_setup_intr(ntb->device, ntb->int_info[0].res,
 	    INTR_MPSAFE | INTR_TYPE_MISC, NULL, ndev_irq_isr,
 	    ntb, &ntb->int_info[0].tag);
 	if (rc != 0) {
 		device_printf(ntb->device, "bus_setup_intr failed\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 /*
  * Undo interrupt setup: remove every installed handler, release every
  * allocated IRQ resource, free the per-vector argument array and release
  * the MSI/MSI-X allocation.
  *
  * The bookkeeping (tag, res, allocated_interrupts) is reset as resources
  * are dropped, so calling this again does not tear down or release the
  * same handler or resource twice.
  */
 static void
 intel_ntb_teardown_interrupts(struct ntb_softc *ntb)
 {
 	struct ntb_int_info *current_int;
 	int i;
 
 	for (i = 0; i < ntb->allocated_interrupts; i++) {
 		current_int = &ntb->int_info[i];
 
 		/* The handler must go before the resource it hangs off. */
 		if (current_int->tag != NULL) {
 			bus_teardown_intr(ntb->device, current_int->res,
 			    current_int->tag);
 			current_int->tag = NULL;
 		}
 
 		if (current_int->res != NULL) {
 			bus_release_resource(ntb->device, SYS_RES_IRQ,
 			    rman_get_rid(current_int->res), current_int->res);
 			current_int->res = NULL;
 		}
 	}
 	ntb->allocated_interrupts = 0;
 
 	intel_ntb_free_msix_vec(ntb);
 	pci_release_msi(ntb->device);
 }
 
 /*
  * Doorbell register and mask are 64-bit on Atom, 16-bit on Xeon.  Abstract it
  * out to make code clearer.
  */
 /*
  * Read a doorbell-class register at "regoff" using the access width of the
  * device type: 8 bytes on Atom, 2 bytes on Xeon.
  */
 static inline uint64_t
 db_ioread(struct ntb_softc *ntb, uint64_t regoff)
 {
 
 	if (ntb->type != NTB_ATOM) {
 		KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
 		return (intel_ntb_reg_read(2, regoff));
 	}
 
 	return (intel_ntb_reg_read(8, regoff));
 }
 
 /*
  * Checked doorbell register write.
  *
  * Asserts that "val" only contains bits from db_valid_mask and, when the
  * target is this side's doorbell mask register, that the doorbell mask
  * lock is held.  The write itself is done by db_iowrite_raw().
  */
 static inline void
 db_iowrite(struct ntb_softc *ntb, uint64_t regoff, uint64_t val)
 {
 
 	KASSERT((val & ~ntb->db_valid_mask) == 0,
 	    ("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
 	     (uintmax_t)(val & ~ntb->db_valid_mask),
 	     (uintmax_t)ntb->db_valid_mask));
 
 	/* Writes to the mask register must be serialized by the caller. */
 	if (regoff == ntb->self_reg->db_mask)
 		DB_MASK_ASSERT(ntb, MA_OWNED);
 	db_iowrite_raw(ntb, regoff, val);
 }
 
 /*
  * Unchecked doorbell register write using the access width of the device
  * type: 8 bytes on Atom, 2 bytes (value truncated to 16 bits) on Xeon.
  */
 static inline void
 db_iowrite_raw(struct ntb_softc *ntb, uint64_t regoff, uint64_t val)
 {
 
 	if (ntb->type != NTB_ATOM) {
 		KASSERT(ntb->type == NTB_XEON, ("bad ntb type"));
 		intel_ntb_reg_write(2, regoff, (uint16_t)val);
 	} else {
 		intel_ntb_reg_write(8, regoff, val);
 	}
 }
 
 /*
  * Add "bits" to the cached doorbell mask.  Unless the SB01BASE_LOCKUP
  * workaround is active, the updated mask is also written to the hardware
  * mask register.  Both steps happen under the doorbell mask lock.
  */
 static void
 intel_ntb_db_set_mask(device_t dev, uint64_t bits)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 
 	DB_MASK_LOCK(ntb);
 	ntb->db_mask |= bits;
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		/* Workaround mode: the mask is kept in software only. */
 	} else {
 		db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
 	}
 	DB_MASK_UNLOCK(ntb);
 }
 
 /*
  * Remove "bits" from the cached doorbell mask.
  *
  * Without the SB01BASE_LOCKUP workaround the new mask is written to the
  * hardware mask register.  With the workaround, the handlers of vectors
  * that have a pending software doorbell bit which was masked until now are
  * scheduled via swi_sched().  Everything runs under the doorbell mask lock.
  */
 static void
 intel_ntb_db_clear_mask(device_t dev, uint64_t bits)
 {
 	struct ntb_softc *ntb = device_get_softc(dev);
 	uint64_t ibits;
 	int i;
 
 	KASSERT((bits & ~ntb->db_valid_mask) == 0,
 	    ("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
 	     (uintmax_t)(bits & ~ntb->db_valid_mask),
 	     (uintmax_t)ntb->db_valid_mask));
 
 	DB_MASK_LOCK(ntb);
 	/* Pending bits that were masked and are being unmasked right now. */
 	ibits = ntb->fake_db_bell & ntb->db_mask & bits;
 	ntb->db_mask &= ~bits;
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		/* Simulate fake interrupts if unmasked DB bits are set. */
 		for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
 			if ((ibits & intel_ntb_db_vector_mask(dev, i)) != 0)
 				swi_sched(ntb->int_info[i].tag, 0);
 		}
 	} else {
 		db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
 	}
 	DB_MASK_UNLOCK(ntb);
 }
 
 /*
  * Return the current doorbell bits: the software-maintained fake_db_bell
  * when the SB01BASE_LOCKUP workaround is active, otherwise the value of
  * this side's hardware doorbell register.
  */
 static uint64_t
 intel_ntb_db_read(device_t dev)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 	if (!HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP))
 		return (db_ioread(ntb, ntb->self_reg->db_bell));
 
 	return (ntb->fake_db_bell);
 }
 
 /*
  * Clear the doorbell bits in "bits".  With the SB01BASE_LOCKUP workaround
  * they are removed from the software fake_db_bell under the doorbell mask
  * lock; otherwise "bits" is written to the hardware doorbell register.
  */
 static void
 intel_ntb_db_clear(device_t dev, uint64_t bits)
 {
 	struct ntb_softc *ntb = device_get_softc(dev);
 
 	KASSERT((bits & ~ntb->db_valid_mask) == 0,
 	    ("%s: Invalid bits 0x%jx (valid: 0x%jx)", __func__,
 	     (uintmax_t)(bits & ~ntb->db_valid_mask),
 	     (uintmax_t)ntb->db_valid_mask));
 
 	if (!HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		db_iowrite(ntb, ntb->self_reg->db_bell, bits);
 		return;
 	}
 
 	DB_MASK_LOCK(ntb);
 	ntb->fake_db_bell &= ~bits;
 	DB_MASK_UNLOCK(ntb);
 }
 
 /*
  * Return the mask of doorbell bits that are serviced by interrupt vector
  * "db_vector".
  *
  * In the general case every vector owns db_vec_shift consecutive bits,
  * starting at bit (db_vec_shift * db_vector).
  */
 static inline uint64_t
 intel_ntb_vec_mask(struct ntb_softc *ntb, uint64_t db_vector)
 {
 	uint64_t width;
 
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		/*
 		 * Custom mapping for the workaround, so that at least the
 		 * first doorbells do not generate stray events: the leading
 		 * vectors get one bit each and the last non-link vector gets
 		 * the remaining bits.  This breaks Linux compatibility (if
 		 * one existed) when more than one DB is used (not by if_ntb).
 		 */
 		if (db_vector == XEON_NONLINK_DB_MSIX_BITS - 1)
 			return (0x7ffc);
 		if (db_vector < XEON_NONLINK_DB_MSIX_BITS - 1)
 			return (1 << db_vector);
 	}
 
 	width = ntb->db_vec_shift;
 	return (((1ull << width) - 1) << (width * db_vector));
 }
 
 /*
  * Common interrupt body for vector "vec".
  *
  * Records the time of the interrupt, polls the link when the vector covers
  * the link doorbell bit, and reports a doorbell event when the vector still
  * covers valid doorbell bits after the workaround filtering below.
  */
 static void
 intel_ntb_interrupt(struct ntb_softc *ntb, uint32_t vec)
 {
 	uint64_t vec_mask;
 
 	ntb->last_ts = ticks;
 	vec_mask = intel_ntb_vec_mask(ntb, vec);
 
 	/* Link vector: notify only when the poll reports a change. */
 	if ((vec_mask & ntb->db_link_mask) != 0) {
 		if (intel_ntb_poll_link(ntb))
 			ntb_link_event(ntb->device);
 	}
 
 	/* Workaround mode keeps the doorbell state in software. */
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP) &&
 	    (vec_mask & ntb->db_link_mask) == 0) {
 		DB_MASK_LOCK(ntb);
 
 		/* Do not report same DB events again if not cleared yet. */
 		vec_mask &= ~ntb->fake_db_bell;
 
 		/* Update our internal doorbell register. */
 		ntb->fake_db_bell |= vec_mask;
 
 		/* Do not report masked DB events. */
 		vec_mask &= ~ntb->db_mask;
 
 		DB_MASK_UNLOCK(ntb);
 	}
 
 	if ((vec_mask & ntb->db_valid_mask) != 0)
 		ntb_db_event(ntb->device, vec);
 }
 
 /*
  * Per-vector MSI-X handler.  "arg" is the struct ntb_vec registered for the
  * vector; it supplies both the softc and the vector number.
  */
 static void
 ndev_vec_isr(void *arg)
 {
 	struct ntb_vec *vec;
 
 	vec = arg;
 	intel_ntb_interrupt(vec->ntb, vec->num);
 }
 
 /*
  * Legacy interrupt handler.  "arg" is the softc.  Without MSI-X there is
  * only a single vector, so everything is reported as vector 0.
  */
 static void
 ndev_irq_isr(void *arg)
 {
 	struct ntb_softc *ntb;
 
 	ntb = arg;
 	intel_ntb_interrupt(ntb, 0);
 }
 
 static int
 intel_ntb_create_msix_vec(struct ntb_softc *ntb, uint32_t num_vectors)
 {
 	uint32_t i;
 
 	ntb->msix_vec = malloc(num_vectors * sizeof(*ntb->msix_vec), M_NTB,
 	    M_ZERO | M_WAITOK);
 	for (i = 0; i < num_vectors; i++) {
 		ntb->msix_vec[i].num = i;
 		ntb->msix_vec[i].ntb = ntb;
 	}
 
 	return (0);
 }
 
 /*
  * Free the per-vector handler argument array, if there is one, and clear
  * the pointer so that the call can be repeated safely.
  */
 static void
 intel_ntb_free_msix_vec(struct ntb_softc *ntb)
 {
 
 	if (ntb->msix_vec != NULL) {
 		free(ntb->msix_vec, M_NTB);
 		ntb->msix_vec = NULL;
 	}
 }
 
 /*
  * Read the lower address and data words of the first
  * XEON_NONLINK_DB_MSIX_BITS entries of the device's MSI-X table and store
  * them in ntb->msix_data[].
  */
 static void
 intel_ntb_get_msix_info(struct ntb_softc *ntb)
 {
 	struct pci_devinfo *dinfo;
 	struct pcicfg_msix *msix;
 	uint32_t addr_lo, msg_data, entry_off, vec;
 
 	CTASSERT(XEON_NONLINK_DB_MSIX_BITS == nitems(ntb->msix_data));
 
 	dinfo = device_get_ivars(ntb->device);
 	msix = &dinfo->cfg.msix;
 
 	/* Walk the table one entry at a time. */
 	entry_off = msix->msix_table_offset;
 	for (vec = 0; vec < XEON_NONLINK_DB_MSIX_BITS; vec++) {
 		addr_lo = bus_read_4(msix->msix_table_res, entry_off +
 		    PCI_MSIX_ENTRY_LOWER_ADDR);
 		intel_ntb_printf(2, "local MSIX addr(%u): 0x%x\n", vec,
 		    addr_lo);
 
 		KASSERT((addr_lo & MSI_INTEL_ADDR_BASE) == MSI_INTEL_ADDR_BASE,
 		    ("local MSIX addr 0x%x not in MSI base 0x%x", addr_lo,
 		     MSI_INTEL_ADDR_BASE));
 		ntb->msix_data[vec].nmd_ofs = addr_lo;
 
 		msg_data = bus_read_4(msix->msix_table_res, entry_off +
 		    PCI_MSIX_ENTRY_DATA);
 		intel_ntb_printf(2, "local MSIX data(%u): 0x%x\n", vec,
 		    msg_data);
 
 		ntb->msix_data[vec].nmd_data = msg_data;
 
 		entry_off += PCI_MSIX_ENTRY_SIZE;
 	}
 }
 
 /*
  * Look up "device_id" in the pci_ids table, which is terminated by an
  * entry whose device_id is zero.
  *
  * Returns the matching entry, or NULL if there is none.
  */
 static struct ntb_hw_info *
 intel_ntb_get_device_info(uint32_t device_id)
 {
 	struct ntb_hw_info *ep;
 
 	for (ep = pci_ids; ep->device_id != 0; ep++) {
 		if (ep->device_id == device_id)
 			return (ep);
 	}
 
 	return (NULL);
 }
 
 /*
  * Disable the link, but only if the register description has been set up
  * (ntb->reg is non-NULL).
  */
 static void
 intel_ntb_teardown_xeon(struct ntb_softc *ntb)
 {
 
 	if (ntb->reg == NULL)
 		return;
 
 	intel_ntb_link_disable(ntb->device);
 }
 
 /*
  * Set ntb->mw_count from the device type and, for non-Atom devices, from
  * whether the split-BAR feature is present.
  */
 static void
 intel_ntb_detect_max_mw(struct ntb_softc *ntb)
 {
 
 	if (ntb->type == NTB_ATOM)
 		ntb->mw_count = ATOM_MW_COUNT;
 	else if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		ntb->mw_count = XEON_HSX_SPLIT_MW_COUNT;
 	else
 		ntb->mw_count = XEON_SNB_MW_COUNT;
 }
 
 /*
  * Decode the one-byte PPD config register of a Xeon NTB: device type,
  * split-BAR feature and connection type.  Also resolves conflicts between
  * the errata workaround feature flags.
  *
  * Returns 0 for a B2B connection, ENXIO for any other connection type.
  */
 static int
 intel_ntb_detect_xeon(struct ntb_softc *ntb)
 {
 	uint8_t ppd, conn_type;
 
 	ppd = pci_read_config(ntb->device, NTB_PPD_OFFSET, 1);
 	ntb->ppd = ppd;
 
 	ntb->dev_type = ((ppd & XEON_PPD_DEV_TYPE) != 0) ?
 	    NTB_DEV_DSD : NTB_DEV_USD;
 
 	if ((ppd & XEON_PPD_SPLIT_BAR) != 0)
 		ntb->features |= NTB_SPLIT_BAR;
 
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 			/*
 			 * SDOORBELL errata workaround gets in the way of
 			 * SB01BASE_LOCKUP errata workaround; only do one at
 			 * a time.
 			 */
 			ntb->features &= ~NTB_SDOORBELL_LOCKUP;
 		} else {
 			/* The workaround depends on split BARs; drop it. */
 			device_printf(ntb->device,
 			    "Can not apply SB01BASE_LOCKUP workaround "
 			    "with split BARs disabled!\n");
 			device_printf(ntb->device,
 			    "Expect system hangs under heavy NTB traffic!\n");
 			ntb->features &= ~NTB_SB01BASE_LOCKUP;
 		}
 	}
 
 	conn_type = ppd & XEON_PPD_CONN_TYPE;
 	if (conn_type != NTB_CONN_B2B) {
 		/* Covers RP, transparent and anything unknown. */
 		device_printf(ntb->device, "Unsupported connection type: %u\n",
 		    (unsigned)conn_type);
 		return (ENXIO);
 	}
 
 	ntb->conn_type = conn_type;
 	return (0);
 }
 
 /*
  * Decode the four-byte PPD config register of an Atom NTB: device type and
  * connection type.
  *
  * Returns 0 for a B2B connection, ENXIO for any other connection type.
  */
 static int
 intel_ntb_detect_atom(struct ntb_softc *ntb)
 {
 	uint32_t ppd, conn_type;
 
 	ppd = pci_read_config(ntb->device, NTB_PPD_OFFSET, 4);
 	ntb->ppd = ppd;
 
 	ntb->dev_type = ((ppd & ATOM_PPD_DEV_TYPE) != 0) ?
 	    NTB_DEV_DSD : NTB_DEV_USD;
 
 	conn_type = (ppd & ATOM_PPD_CONN_TYPE) >> 8;
 	if (conn_type != NTB_CONN_B2B) {
 		device_printf(ntb->device, "Unsupported NTB configuration\n");
 		return (ENXIO);
 	}
 
 	ntb->conn_type = conn_type;
 	return (0);
 }
 
 /*
  * Xeon-specific device initialization.
  *
  * Loads the Xeon doorbell/scratchpad parameters and register tables, picks
  * the memory window used by whichever errata workaround is active, sets up
  * the B2B memory window, enables bus mastering and memory space on the
  * secondary side, masks all doorbells and finally sets up interrupts.
  *
  * Returns 0 on success, ENXIO for a non-B2B connection, or the error from
  * xeon_setup_b2b_mw() / intel_ntb_init_isr().
  */
 static int
 intel_ntb_xeon_init_dev(struct ntb_softc *ntb)
 {
 	int rc;
 
 	ntb->spad_count		= XEON_SPAD_COUNT;
 	ntb->db_count		= XEON_DB_COUNT;
 	ntb->db_link_mask	= XEON_DB_LINK_BIT;
 	ntb->db_vec_count	= XEON_DB_MSIX_VECTOR_COUNT;
 	ntb->db_vec_shift	= XEON_DB_MSIX_VECTOR_SHIFT;
 
 	if (ntb->conn_type != NTB_CONN_B2B) {
 		device_printf(ntb->device, "Connection type %d not supported\n",
 		    ntb->conn_type);
 		return (ENXIO);
 	}
 
 	ntb->reg = &xeon_reg;
 	ntb->self_reg = &xeon_pri_reg;
 	ntb->peer_reg = &xeon_b2b_reg;
 	ntb->xlat_reg = &xeon_sec_xlat;
 
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		ntb->fake_db_bell = 0;
 		/* Adding mw_count first lets a negative tunable index wrap. */
 		ntb->msix_mw_idx = (ntb->mw_count + g_ntb_msix_idx) %
 		    ntb->mw_count;
 		intel_ntb_printf(2, "Setting up MSIX mw idx %d means %u\n",
 		    g_ntb_msix_idx, ntb->msix_mw_idx);
 		rc = intel_ntb_mw_set_wc_internal(ntb, ntb->msix_mw_idx,
 		    VM_MEMATTR_UNCACHEABLE);
 		KASSERT(rc == 0, ("shouldn't fail"));
 	} else if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
 		/*
 		 * There is a Xeon hardware errata related to writes to SDOORBELL or
 		 * B2BDOORBELL in conjunction with inbound access to NTB MMIO space,
 		 * which may hang the system.  To workaround this, use a memory
 		 * window to access the interrupt and scratch pad registers on the
 		 * remote system.
 		 */
 		ntb->b2b_mw_idx = (ntb->mw_count + g_ntb_mw_idx) %
 		    ntb->mw_count;
 		intel_ntb_printf(2, "Setting up b2b mw idx %d means %u\n",
 		    g_ntb_mw_idx, ntb->b2b_mw_idx);
 		rc = intel_ntb_mw_set_wc_internal(ntb, ntb->b2b_mw_idx,
 		    VM_MEMATTR_UNCACHEABLE);
 		KASSERT(rc == 0, ("shouldn't fail"));
 	} else if (HAS_FEATURE(ntb, NTB_B2BDOORBELL_BIT14))
 		/*
 		 * HW Errata on bit 14 of b2bdoorbell register.  Writes will not be
 		 * mirrored to the remote system.  Shrink the number of bits by one,
 		 * since bit 14 is the last bit.
 		 *
 		 * On REGS_THRU_MW errata mode, we don't use the b2bdoorbell register
 		 * anyway.  Nor for non-B2B connection types.
 		 */
 		ntb->db_count = XEON_DB_COUNT - 1;
 
 	/* One valid bit per doorbell, starting at bit 0. */
 	ntb->db_valid_mask = (1ull << ntb->db_count) - 1;
 
 	/* The two sides use the address tables in opposite roles. */
 	if (ntb->dev_type == NTB_DEV_USD)
 		rc = xeon_setup_b2b_mw(ntb, &xeon_b2b_dsd_addr,
 		    &xeon_b2b_usd_addr);
 	else
 		rc = xeon_setup_b2b_mw(ntb, &xeon_b2b_usd_addr,
 		    &xeon_b2b_dsd_addr);
 	if (rc != 0)
 		return (rc);
 
 	/* Enable Bus Master and Memory Space on the secondary side */
 	intel_ntb_reg_write(2, XEON_SPCICMD_OFFSET,
 	    PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
 
 	/*
 	 * Mask all doorbell interrupts.
 	 */
 	DB_MASK_LOCK(ntb);
 	ntb->db_mask = ntb->db_valid_mask;
 	db_iowrite(ntb, ntb->self_reg->db_mask, ntb->db_mask);
 	DB_MASK_UNLOCK(ntb);
 
 	rc = intel_ntb_init_isr(ntb);
 	return (rc);
 }
 
 /*
  * Atom-specific device initialization.
  *
  * Loads the Atom doorbell/scratchpad parameters and register tables,
  * programs the secondary side BARs, enables bus mastering and memory space
  * on the secondary side, sets up interrupts, starts link training and
  * kicks off the link heartbeat callout.
  *
  * Returns 0 on success or the error from intel_ntb_init_isr().
  */
 static int
 intel_ntb_atom_init_dev(struct ntb_softc *ntb)
 {
 	int error;
 
 	KASSERT(ntb->conn_type == NTB_CONN_B2B,
 	    ("Unsupported NTB configuration (%d)\n", ntb->conn_type));
 
 	ntb->spad_count		 = ATOM_SPAD_COUNT;
 	ntb->db_count		 = ATOM_DB_COUNT;
 	ntb->db_vec_count	 = ATOM_DB_MSIX_VECTOR_COUNT;
 	ntb->db_vec_shift	 = ATOM_DB_MSIX_VECTOR_SHIFT;
 	/* One valid bit per doorbell, starting at bit 0. */
 	ntb->db_valid_mask	 = (1ull << ntb->db_count) - 1;
 
 	ntb->reg = &atom_reg;
 	ntb->self_reg = &atom_pri_reg;
 	ntb->peer_reg = &atom_b2b_reg;
 	ntb->xlat_reg = &atom_sec_xlat;
 
 	/*
 	 * FIXME - MSI-X bug on early Atom HW, remove once internal issue is
 	 * resolved.  Mask transaction layer internal parity errors.
 	 */
 	pci_write_config(ntb->device, 0xFC, 0x4, 4);
 
 	configure_atom_secondary_side_bars(ntb);
 
 	/* Enable Bus Master and Memory Space on the secondary side */
 	intel_ntb_reg_write(2, ATOM_SPCICMD_OFFSET,
 	    PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
 
 	error = intel_ntb_init_isr(ntb);
 	if (error != 0)
 		return (error);
 
 	/* Initiate PCI-E link training */
 	intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
 
 	/* Run the first link heartbeat as soon as possible (zero delay). */
 	callout_reset(&ntb->heartbeat_timer, 0, atom_link_hb, ntb);
 
 	return (0);
 }
 
 /* XXX: Linux driver doesn't seem to do any of this for Atom. */
 static void
 configure_atom_secondary_side_bars(struct ntb_softc *ntb)
 {
 
 	/*
 	 * The NTB_DEV_USD and non-USD cases program exactly the same four
 	 * values, so there is no need to test ntb->dev_type here.
 	 */
 	intel_ntb_reg_write(8, ATOM_PBAR2XLAT_OFFSET, XEON_B2B_BAR2_ADDR64);
 	intel_ntb_reg_write(8, ATOM_PBAR4XLAT_OFFSET, XEON_B2B_BAR4_ADDR64);
 	intel_ntb_reg_write(8, ATOM_MBAR23_OFFSET, XEON_B2B_BAR2_ADDR64);
 	intel_ntb_reg_write(8, ATOM_MBAR45_OFFSET, XEON_B2B_BAR4_ADDR64);
 }
 
 
 /*
  * When working around Xeon SDOORBELL errata by remapping remote registers in a
  * MW, limit the B2B MW to half a MW.  By sharing a MW, half the shared MW
  * remains for use by a higher layer.
  *
  * Will only be used if working around SDOORBELL errata and the BIOS-configured
  * MW size is sufficiently large.
  *
  * Consumed by xeon_setup_b2b_mw(): when non-zero and half of the B2B BAR is
  * at least XEON_B2B_MIN_SIZE, the B2B register offset is set to half the
  * BAR size.
  */
 static unsigned int ntb_b2b_mw_share;
 SYSCTL_UINT(_hw_ntb, OID_AUTO, b2b_mw_share, CTLFLAG_RDTUN, &ntb_b2b_mw_share,
     0, "If enabled (non-zero), prefer to share half of the B2B peer register "
     "MW with higher level consumers.  Both sides of the NTB MUST set the same "
     "value here.");
 
 /*
  * Copy the primary BAR size register of BAR "idx" to the matching secondary
  * BAR size register.  For the BAR that carries the B2B registers ("regbar")
  * the size is reduced by one step when the BAR is shared (b2b_off != 0) and
  * set to zero otherwise.
  *
  * Does nothing for NTB_B2B_BAR_3 and above unless split BARs are in use.
  */
 static void
 xeon_reset_sbar_size(struct ntb_softc *ntb, enum ntb_bar idx,
     enum ntb_bar regbar)
 {
 	struct ntb_pci_bar_info *bar;
 	uint8_t size_bits;
 
 	if (idx >= NTB_B2B_BAR_3 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		return;
 
 	bar = &ntb->bar_info[idx];
 	size_bits = pci_read_config(ntb->device, bar->psz_off, 1);
 	if (idx == regbar)
 		size_bits = (ntb->b2b_off != 0) ? size_bits - 1 : 0;
 
 	pci_write_config(ntb->device, bar->ssz_off, size_bits, 1);
 	/* Read the register back; the value itself is not needed. */
 	(void)pci_read_config(ntb->device, bar->ssz_off, 1);
 }
 
 /*
  * Program the secondary base and limit registers of BAR "idx" with the same
  * address (limit == base).  For the BAR that carries the B2B registers
  * ("regbar") the address is advanced by b2b_off when the BAR is shared, and
  * forced to zero otherwise.  Each write is followed by a read back of the
  * same register, using a 4- or 8-byte access depending on the BAR width.
  */
 static void
 xeon_set_sbar_base_and_limit(struct ntb_softc *ntb, uint64_t bar_addr,
     enum ntb_bar idx, enum ntb_bar regbar)
 {
 	uint32_t base_reg, lmt_reg;
 
 	bar_get_xlat_params(ntb, idx, &base_reg, NULL, &lmt_reg);
 
 	if (idx == regbar) {
 		if (ntb->b2b_off != 0)
 			bar_addr += ntb->b2b_off;
 		else
 			bar_addr = 0;
 	}
 
 	if (bar_is_64bit(ntb, idx)) {
 		intel_ntb_reg_write(8, base_reg, bar_addr);
 		(void)intel_ntb_reg_read(8, base_reg);
 
 		intel_ntb_reg_write(8, lmt_reg, bar_addr);
 		(void)intel_ntb_reg_read(8, lmt_reg);
 	} else {
 		intel_ntb_reg_write(4, base_reg, bar_addr);
 		(void)intel_ntb_reg_read(4, base_reg);
 
 		intel_ntb_reg_write(4, lmt_reg, bar_addr);
 		(void)intel_ntb_reg_read(4, lmt_reg);
 	}
 }
 
 /*
  * Write "base_addr" to the primary translation register of BAR "idx" and
  * read it back.  A 4-byte access is used for NTB_B2B_BAR_2 and above when
  * split BARs are in use, an 8-byte access otherwise.
  */
 static void
 xeon_set_pbar_xlat(struct ntb_softc *ntb, uint64_t base_addr, enum ntb_bar idx)
 {
 	struct ntb_pci_bar_info *bar;
 
 	bar = &ntb->bar_info[idx];
 	if (idx >= NTB_B2B_BAR_2 && HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		intel_ntb_reg_write(4, bar->pbarxlat_off, base_addr);
 		(void)intel_ntb_reg_read(4, bar->pbarxlat_off);
 		return;
 	}
 
 	intel_ntb_reg_write(8, bar->pbarxlat_off, base_addr);
 	(void)intel_ntb_reg_read(8, bar->pbarxlat_off);
 }
 
 /*
  * Program the Xeon secondary and primary BAR registers for back-to-back
  * operation.
  *
  * ntb       - device softc
  * addr      - address table used for the secondary (incoming) BAR setup
  * peer_addr - address table used for the primary (outgoing) translations
  *
  * Returns 0 on success, or EIO when a B2B memory window is requested but
  * its BAR is smaller than XEON_B2B_MIN_SIZE.
  */
 static int
 xeon_setup_b2b_mw(struct ntb_softc *ntb, const struct ntb_b2b_addr *addr,
     const struct ntb_b2b_addr *peer_addr)
 {
 	struct ntb_pci_bar_info *b2b_bar;
 	vm_size_t bar_size;
 	uint64_t bar_addr;
 	enum ntb_bar b2b_bar_num, i;
 
 	/* Decide which BAR carries the B2B registers and how much of it. */
 	if (ntb->b2b_mw_idx == B2B_MW_DISABLED) {
 		b2b_bar = NULL;
 		b2b_bar_num = NTB_CONFIG_BAR;
 		ntb->b2b_off = 0;
 	} else {
 		b2b_bar_num = intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx);
 		KASSERT(b2b_bar_num > 0 && b2b_bar_num < NTB_MAX_BARS,
 		    ("invalid b2b mw bar"));
 
 		b2b_bar = &ntb->bar_info[b2b_bar_num];
 		bar_size = b2b_bar->size;
 
 		/* Share half the BAR if asked to and it is large enough. */
 		if (ntb_b2b_mw_share != 0 &&
 		    (bar_size >> 1) >= XEON_B2B_MIN_SIZE)
 			ntb->b2b_off = bar_size >> 1;
 		else if (bar_size >= XEON_B2B_MIN_SIZE) {
 			ntb->b2b_off = 0;
 		} else {
 			device_printf(ntb->device,
 			    "B2B bar size is too small!\n");
 			return (EIO);
 		}
 	}
 
 	/*
 	 * Reset the secondary bar sizes to match the primary bar sizes.
 	 * (Except, disable or halve the size of the B2B secondary bar.)
 	 */
 	for (i = NTB_B2B_BAR_1; i < NTB_MAX_BARS; i++)
 		xeon_reset_sbar_size(ntb, i, b2b_bar_num);
 
 	/* Pick the local address that matches the B2B register BAR. */
 	bar_addr = 0;
 	if (b2b_bar_num == NTB_CONFIG_BAR)
 		bar_addr = addr->bar0_addr;
 	else if (b2b_bar_num == NTB_B2B_BAR_1)
 		bar_addr = addr->bar2_addr64;
 	else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		bar_addr = addr->bar4_addr64;
 	else if (b2b_bar_num == NTB_B2B_BAR_2)
 		bar_addr = addr->bar4_addr32;
 	else if (b2b_bar_num == NTB_B2B_BAR_3)
 		bar_addr = addr->bar5_addr32;
 	else
 		KASSERT(false, ("invalid bar"));
 
 	intel_ntb_reg_write(8, XEON_SBAR0BASE_OFFSET, bar_addr);
 
 	/*
 	 * Other SBARs are normally hit by the PBAR xlat, except for the b2b
 	 * register BAR.  The B2B BAR is either disabled above or configured
 	 * half-size.  It starts at PBAR xlat + offset.
 	 *
 	 * Also set up incoming BAR limits == base (zero length window).
 	 */
 	xeon_set_sbar_base_and_limit(ntb, addr->bar2_addr64, NTB_B2B_BAR_1,
 	    b2b_bar_num);
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		xeon_set_sbar_base_and_limit(ntb, addr->bar4_addr32,
 		    NTB_B2B_BAR_2, b2b_bar_num);
 		xeon_set_sbar_base_and_limit(ntb, addr->bar5_addr32,
 		    NTB_B2B_BAR_3, b2b_bar_num);
 	} else
 		xeon_set_sbar_base_and_limit(ntb, addr->bar4_addr64,
 		    NTB_B2B_BAR_2, b2b_bar_num);
 
 	/* Zero incoming translation addrs */
 	intel_ntb_reg_write(8, XEON_SBAR2XLAT_OFFSET, 0);
 	intel_ntb_reg_write(8, XEON_SBAR4XLAT_OFFSET, 0);
 
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		uint32_t xlat_reg, lmt_reg;
 		enum ntb_bar bar_num;
 
 		/*
 		 * We point the chosen MSIX MW BAR xlat to remote LAPIC for
 		 * workaround
 		 */
 		bar_num = intel_ntb_mw_to_bar(ntb, ntb->msix_mw_idx);
 		bar_get_xlat_params(ntb, bar_num, NULL, &xlat_reg, &lmt_reg);
 		/* msix_xlat keeps the value read back from the register. */
 		if (bar_is_64bit(ntb, bar_num)) {
 			intel_ntb_reg_write(8, xlat_reg, MSI_INTEL_ADDR_BASE);
 			ntb->msix_xlat = intel_ntb_reg_read(8, xlat_reg);
 			intel_ntb_reg_write(8, lmt_reg, 0);
 		} else {
 			intel_ntb_reg_write(4, xlat_reg, MSI_INTEL_ADDR_BASE);
 			ntb->msix_xlat = intel_ntb_reg_read(4, xlat_reg);
 			intel_ntb_reg_write(4, lmt_reg, 0);
 		}
 
 		ntb->peer_lapic_bar =  &ntb->bar_info[bar_num];
 	}
 	/* Read the translation registers back; the values are discarded. */
 	(void)intel_ntb_reg_read(8, XEON_SBAR2XLAT_OFFSET);
 	(void)intel_ntb_reg_read(8, XEON_SBAR4XLAT_OFFSET);
 
 	/* Zero outgoing translation limits (whole bar size windows) */
 	intel_ntb_reg_write(8, XEON_PBAR2LMT_OFFSET, 0);
 	intel_ntb_reg_write(8, XEON_PBAR4LMT_OFFSET, 0);
 
 	/* Set outgoing translation offsets */
 	xeon_set_pbar_xlat(ntb, peer_addr->bar2_addr64, NTB_B2B_BAR_1);
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		xeon_set_pbar_xlat(ntb, peer_addr->bar4_addr32, NTB_B2B_BAR_2);
 		xeon_set_pbar_xlat(ntb, peer_addr->bar5_addr32, NTB_B2B_BAR_3);
 	} else
 		xeon_set_pbar_xlat(ntb, peer_addr->bar4_addr64, NTB_B2B_BAR_2);
 
 	/* Set the translation offset for B2B registers */
 	bar_addr = 0;
 	if (b2b_bar_num == NTB_CONFIG_BAR)
 		bar_addr = peer_addr->bar0_addr;
 	else if (b2b_bar_num == NTB_B2B_BAR_1)
 		bar_addr = peer_addr->bar2_addr64;
 	else if (b2b_bar_num == NTB_B2B_BAR_2 && !HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		bar_addr = peer_addr->bar4_addr64;
 	else if (b2b_bar_num == NTB_B2B_BAR_2)
 		bar_addr = peer_addr->bar4_addr32;
 	else if (b2b_bar_num == NTB_B2B_BAR_3)
 		bar_addr = peer_addr->bar5_addr32;
 	else
 		KASSERT(false, ("invalid bar"));
 
 	/*
 	 * B2B_XLAT_OFFSET is a 64-bit register but can only be written 32 bits
 	 * at a time.
 	 */
 	intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETL, bar_addr & 0xffffffff);
 	intel_ntb_reg_write(4, XEON_B2B_XLAT_OFFSETU, bar_addr >> 32);
 	return (0);
 }
 
 /*
  * Xeon link state from the cached link status: a transparent connection
  * always counts as up, otherwise the ACTIVE bit of lnk_sta decides.
  */
 static inline bool
 _xeon_link_is_up(struct ntb_softc *ntb)
 {
 
 	return (ntb->conn_type == NTB_CONN_TRANSPARENT ||
 	    (ntb->lnk_sta & NTB_LINK_STATUS_ACTIVE) != 0);
 }
 
 /*
  * Report whether the link is usable, based on cached state.
  *
  * Xeon: the link must be up and, when the SB01BASE_LOCKUP workaround is
  * active, peer_msix_good must be set as well.
  * Atom: the LINK_DOWN bit of the cached ntb_ctl must be clear.
  */
 static inline bool
 link_is_up(struct ntb_softc *ntb)
 {
 
 	if (ntb->type != NTB_XEON) {
 		KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
 		return ((ntb->ntb_ctl & ATOM_CNTL_LINK_DOWN) == 0);
 	}
 
 	if (!_xeon_link_is_up(ntb))
 		return (false);
 
 	return (ntb->peer_msix_good ||
 	    !HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP));
 }
 
 /*
  * Check two Atom status registers for a link error: FORCEDETECT in the
  * LTSSM state jump register, or the overflow bit in the IBIST error
  * register.  The second register is only read when the first check does
  * not already report an error.
  */
 static inline bool
 atom_link_is_err(struct ntb_softc *ntb)
 {
 	uint32_t ltssm, ibist;
 
 	KASSERT(ntb->type == NTB_ATOM, ("ntb type"));
 
 	ltssm = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
 	if ((ltssm & ATOM_LTSSMSTATEJMP_FORCEDETECT) != 0)
 		return (true);
 
 	ibist = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
 	if ((ibist & ATOM_IBIST_ERR_OFLOW) != 0)
 		return (true);
 
 	return (false);
 }
 
 /*
  * Atom does not have link status interrupt, poll on that platform.
  *
  * Callout handler ("arg" is the softc).  Re-arms itself on heartbeat_timer,
  * unless the link is down with an error, in which case it hands over to
  * recover_atom_link on lr_timer instead.
  */
 static void
 atom_link_hb(void *arg)
 {
 	struct ntb_softc *ntb = arg;
 	sbintime_t timo, poll_ts;
 
 	/* Both values are in ticks: the poll is due at last_ts + timeout. */
 	timo = NTB_HB_TIMEOUT * hz;
 	poll_ts = ntb->last_ts + timo;
 
 	/*
 	 * Delay polling the link status if an interrupt was received, unless
 	 * the cached link status says the link is down.
 	 */
 	if ((sbintime_t)ticks - poll_ts < 0 && link_is_up(ntb)) {
 		/* Sleep only for the time that is left until the poll is due. */
 		timo = poll_ts - ticks;
 		goto out;
 	}
 
 	if (intel_ntb_poll_link(ntb))
 		ntb_link_event(ntb->device);
 
 	if (!link_is_up(ntb) && atom_link_is_err(ntb)) {
 		/* Link is down with error, proceed with recovery */
 		callout_reset(&ntb->lr_timer, 0, recover_atom_link, ntb);
 		return;
 	}
 
 out:
 	callout_reset(&ntb->heartbeat_timer, timo, atom_link_hb, ntb);
 }
 
 /*
  * Restart the Atom link: reset the ModPhy lanes, wait for them to settle,
  * clear a set of error status registers and finally clear FORCEDETECT so
  * the link state machine can retrain.  Sleeps via pause().
  */
 static void
 atom_perform_link_restart(struct ntb_softc *ntb)
 {
 	uint32_t status;
 
 	/* Driver resets the NTB ModPhy lanes - magic! */
 	intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0xe0);
 	intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x40);
 	intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG4, 0x60);
 	intel_ntb_reg_write(1, ATOM_MODPHY_PCSREG6, 0x60);
 
 	/* Driver waits 100ms to allow the NTB ModPhy to settle */
 	pause("ModPhy", hz / 10);
 
 	/* Clear AER Errors, write to clear */
 	status = intel_ntb_reg_read(4, ATOM_ERRCORSTS_OFFSET);
 	status &= PCIM_AER_COR_REPLAY_ROLLOVER;
 	intel_ntb_reg_write(4, ATOM_ERRCORSTS_OFFSET, status);
 
 	/* Clear unexpected electrical idle event in LTSSM, write to clear */
 	status = intel_ntb_reg_read(4, ATOM_LTSSMERRSTS0_OFFSET);
 	status |= ATOM_LTSSMERRSTS0_UNEXPECTEDEI;
 	intel_ntb_reg_write(4, ATOM_LTSSMERRSTS0_OFFSET, status);
 
 	/* Clear DeSkew Buffer error, write to clear */
 	status = intel_ntb_reg_read(4, ATOM_DESKEWSTS_OFFSET);
 	status |= ATOM_DESKEWSTS_DBERR;
 	intel_ntb_reg_write(4, ATOM_DESKEWSTS_OFFSET, status);
 
 	/* Write back only the IBIST overflow bit that was read as set. */
 	status = intel_ntb_reg_read(4, ATOM_IBSTERRRCRVSTS0_OFFSET);
 	status &= ATOM_IBIST_ERR_OFLOW;
 	intel_ntb_reg_write(4, ATOM_IBSTERRRCRVSTS0_OFFSET, status);
 
 	/* Releases the NTB state machine to allow the link to retrain */
 	status = intel_ntb_reg_read(4, ATOM_LTSSMSTATEJMP_OFFSET);
 	status &= ~ATOM_LTSSMSTATEJMP_FORCEDETECT;
 	intel_ntb_reg_write(4, ATOM_LTSSMSTATEJMP_OFFSET, status);
 }
 
 /*
  * Administratively bring the NTB link up.
  *
  * The requested speed and width are ignored.  Atom parts set the INIT_LINK
  * bit in the PPD config register; transparent connections only raise a link
  * event; everything else clears the link-disable/config-lock bits in NTB_CTL
  * and turns on snooping for the memory-window BARs.  Always returns 0.
  */
 static int
 intel_ntb_link_enable(device_t dev, enum ntb_speed speed __unused,
     enum ntb_width width __unused)
 {
 	struct ntb_softc *ntb;
 	uint32_t ctl, snoop_bits;
 
 	ntb = device_get_softc(dev);
 
 	intel_ntb_printf(2, "%s\n", __func__);
 
 	if (ntb->type == NTB_ATOM) {
 		pci_write_config(ntb->device, NTB_PPD_OFFSET,
 		    ntb->ppd | ATOM_PPD_INIT_LINK, 4);
 	} else if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
 		ntb_link_event(dev);
 	} else {
 		ctl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
 
 		/* Collect every snoop bit that applies to this BAR layout. */
 		snoop_bits = NTB_CNTL_P2S_BAR23_SNOOP |
 		    NTB_CNTL_S2P_BAR23_SNOOP |
 		    NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
 		if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 			snoop_bits |= NTB_CNTL_P2S_BAR5_SNOOP |
 			    NTB_CNTL_S2P_BAR5_SNOOP;
 
 		ctl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
 		ctl |= snoop_bits;
 		intel_ntb_reg_write(4, ntb->reg->ntb_ctl, ctl);
 	}
 	return (0);
 }
 
 /*
  * Administratively take the NTB link down.
  *
  * Transparent connections only raise a link event.  Otherwise the BAR snoop
  * bits are cleared and the link-disable/config-lock bits are set in NTB_CTL.
  * Always returns 0.
  */
 static int
 intel_ntb_link_disable(device_t dev)
 {
 	struct ntb_softc *ntb;
 	uint32_t ctl, snoop_bits;
 
 	ntb = device_get_softc(dev);
 
 	intel_ntb_printf(2, "%s\n", __func__);
 
 	if (ntb->conn_type == NTB_CONN_TRANSPARENT) {
 		ntb_link_event(dev);
 		return (0);
 	}
 
 	ctl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
 
 	/* Collect every snoop bit that applies to this BAR layout. */
 	snoop_bits = NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP |
 	    NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR))
 		snoop_bits |= NTB_CNTL_P2S_BAR5_SNOOP |
 		    NTB_CNTL_S2P_BAR5_SNOOP;
 
 	ctl &= ~snoop_bits;
 	ctl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
 	intel_ntb_reg_write(4, ntb->reg->ntb_ctl, ctl);
 	return (0);
 }
 
 /*
  * Report whether the link is administratively enabled.
  *
  * Atom: INIT_LINK is set in the PPD config register.  Transparent
  * connections are always considered enabled.  Otherwise: the link-disable
  * bit in NTB_CTL is clear.
  */
 static bool
 intel_ntb_link_enabled(device_t dev)
 {
 	struct ntb_softc *ntb;
 	uint32_t regval;
 
 	ntb = device_get_softc(dev);
 
 	if (ntb->type == NTB_ATOM) {
 		regval = pci_read_config(ntb->device, NTB_PPD_OFFSET, 4);
 		return ((regval & ATOM_PPD_INIT_LINK) != 0);
 	}
 
 	if (ntb->conn_type == NTB_CONN_TRANSPARENT)
 		return (true);
 
 	regval = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
 	return (!(regval & NTB_CNTL_LINK_DISABLE));
 }
 
 /*
  * Timer handler (armed on lr_timer) that tries to recover an Atom link
  * which went down with an error.
  *
  * 'arg' is the struct ntb_softc.  The link is restarted, the handler sleeps
  * for ATOM_LINK_RECOVERY_TIME plus a random extra (scaled by hz / 1000), and
  * then either hands control back to the heartbeat timer or re-arms itself
  * to try again after NTB_HB_TIMEOUT * hz ticks.
  */
 static void
 recover_atom_link(void *arg)
 {
 	struct ntb_softc *ntb = arg;
 	unsigned speed, width, oldspeed, oldwidth;
 	uint32_t status32;
 
 	atom_perform_link_restart(ntb);
 
 	/*
 	 * There is a potential race between the 2 NTB devices recovering at
 	 * the same time.  If the times are the same, the link will not recover
 	 * and the driver will be stuck in this loop forever.  Add a random
 	 * interval to the recovery time to prevent this race.
 	 */
 	status32 = arc4random() % ATOM_LINK_RECOVERY_TIME;
 	pause("Link", (ATOM_LINK_RECOVERY_TIME + status32) * hz / 1000);
 
 	/* Still reporting an error: schedule another recovery attempt. */
 	if (atom_link_is_err(ntb))
 		goto retry;
 
 	/* NTB_CTL says the link is down: stop recovering, resume heartbeat. */
 	status32 = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
 	if ((status32 & ATOM_CNTL_LINK_DOWN) != 0)
 		goto out;
 
 	status32 = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
 	width = NTB_LNK_STA_WIDTH(status32);
 	speed = status32 & NTB_LINK_SPEED_MASK;
 
 	/*
 	 * Compare against the cached link status; a different width or speed
 	 * than before is treated as not yet recovered.
 	 */
 	oldwidth = NTB_LNK_STA_WIDTH(ntb->lnk_sta);
 	oldspeed = ntb->lnk_sta & NTB_LINK_SPEED_MASK;
 	if (oldwidth != width || oldspeed != speed)
 		goto retry;
 
 out:
 	callout_reset(&ntb->heartbeat_timer, NTB_HB_TIMEOUT * hz, atom_link_hb,
 	    ntb);
 	return;
 
 retry:
 	callout_reset(&ntb->lr_timer, NTB_HB_TIMEOUT * hz, recover_atom_link,
 	    ntb);
 }
 
 /*
  * Polls the HW link status register(s); returns true if something has changed.
  *
  * The cached ntb->ntb_ctl / ntb->lnk_sta values are refreshed as a side
  * effect whenever a change is detected.
  *
  * Atom: a change is detected by comparing NTB_CTL against the cached copy.
  * Other types: db_link_mask is written to the local doorbell register
  * (presumably acknowledging the link doorbell -- TODO confirm) and the
  * 16-bit link status is read from PCI config space and compared against the
  * cached copy.
  *
  * With NTB_SB01BASE_LOCKUP, a change that leaves the link up while the peer
  * MSI-X exchange has not completed schedules intel_ntb_exchange_msix and
  * returns false, although lnk_sta has already been updated; a change that
  * leaves the link down resets the peer MSI-X state flags.
  */
 static bool
 intel_ntb_poll_link(struct ntb_softc *ntb)
 {
 	uint32_t ntb_cntl;
 	uint16_t reg_val;
 
 	if (ntb->type == NTB_ATOM) {
 		ntb_cntl = intel_ntb_reg_read(4, ntb->reg->ntb_ctl);
 		if (ntb_cntl == ntb->ntb_ctl)
 			return (false);
 
 		ntb->ntb_ctl = ntb_cntl;
 		ntb->lnk_sta = intel_ntb_reg_read(4, ntb->reg->lnk_sta);
 	} else {
 		db_iowrite_raw(ntb, ntb->self_reg->db_bell, ntb->db_link_mask);
 
 		reg_val = pci_read_config(ntb->device, ntb->reg->lnk_sta, 2);
 		if (reg_val == ntb->lnk_sta)
 			return (false);
 
 		ntb->lnk_sta = reg_val;
 
 		if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 			if (_xeon_link_is_up(ntb)) {
 				if (!ntb->peer_msix_good) {
 					/* Defer the event until MSI-X data is exchanged. */
 					callout_reset(&ntb->peer_msix_work, 0,
 					    intel_ntb_exchange_msix, ntb);
 					return (false);
 				}
 			} else {
 				/* Link lost: the exchange must be redone next time. */
 				ntb->peer_msix_good = false;
 				ntb->peer_msix_done = false;
 			}
 		}
 	}
 	return (true);
 }
 
 /*
  * Link speed taken from the cached link status, or NTB_SPEED_NONE while the
  * link is down.
  */
 static inline enum ntb_speed
 intel_ntb_link_sta_speed(struct ntb_softc *ntb)
 {
 
 	return (link_is_up(ntb) ? (ntb->lnk_sta & NTB_LINK_SPEED_MASK) :
 	    NTB_SPEED_NONE);
 }
 
 /*
  * Link width taken from the cached link status, or NTB_WIDTH_NONE while the
  * link is down.
  */
 static inline enum ntb_width
 intel_ntb_link_sta_width(struct ntb_softc *ntb)
 {
 
 	return (link_is_up(ntb) ? NTB_LNK_STA_WIDTH(ntb->lnk_sta) :
 	    NTB_WIDTH_NONE);
 }
 
 /* Static hw.ntb.debug_info sysctl node. */
 SYSCTL_NODE(_hw_ntb, OID_AUTO, debug_info, CTLFLAG_RW, 0,
     "Driver state, statistics, and HW registers");
 
 /*
  * Flags OR'ed into the arg2 value of sysctl_handle_register() on top of the
  * register offset.  Bits 31:30 select the access width.
  */
 #define NTB_REGSZ_MASK	(3ul << 30)
 #define NTB_REG_64	(1ul << 30)
 #define NTB_REG_32	(2ul << 30)
 #define NTB_REG_16	(3ul << 30)
 #define NTB_REG_8	(0ul << 30)
 
 /* Bit 29: read through db_ioread(); bit 28: read from PCI config space. */
 #define NTB_DB_READ	(1ul << 29)
 #define NTB_PCI_REG	(1ul << 28)
 /* Everything that must be masked off arg2 to recover the register offset. */
 #define NTB_REGFLAGS_MASK	(NTB_REGSZ_MASK | NTB_DB_READ | NTB_PCI_REG)
 
 /*
  * Register this device's sysctl nodes.
  *
  * Adds the link_status / active / admin_up handlers directly under the
  * device's sysctl tree, then a "debug_info" subtree exposing cached softc
  * fields and a "registers" subtree whose leaves read hardware registers on
  * demand through sysctl_handle_register().  Each register leaf encodes the
  * register offset plus NTB_REG_* / NTB_DB_READ / NTB_PCI_REG flags in arg2.
  *
  * Atom devices stop after the common registers; the Xeon-only registers
  * follow, and the outgoing translation/limit and secondary BAR base
  * registers are added only for B2B connections.
  */
 static void
 intel_ntb_sysctl_init(struct ntb_softc *ntb)
 {
 	struct sysctl_oid_list *globals, *tree_par, *regpar, *statpar, *errpar;
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *tree, *tmptree;
 
 	ctx = device_get_sysctl_ctx(ntb->device);
 	globals = SYSCTL_CHILDREN(device_get_sysctl_tree(ntb->device));
 
 	/* Link state and administrative control. */
 	SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "link_status",
 	    CTLFLAG_RD | CTLTYPE_STRING, ntb, 0,
 	    sysctl_handle_link_status_human, "A",
 	    "Link status (human readable)");
 	SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "active",
 	    CTLFLAG_RD | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_status,
 	    "IU", "Link status (1=active, 0=inactive)");
 	SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "admin_up",
 	    CTLFLAG_RW | CTLTYPE_UINT, ntb, 0, sysctl_handle_link_admin,
 	    "IU", "Set/get interface status (1=UP, 0=DOWN)");
 
 	/* debug_info: cached driver state. */
 	tree = SYSCTL_ADD_NODE(ctx, globals, OID_AUTO, "debug_info",
 	    CTLFLAG_RD, NULL, "Driver state, statistics, and HW registers");
 	tree_par = SYSCTL_CHILDREN(tree);
 
 	SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "conn_type", CTLFLAG_RD,
 	    &ntb->conn_type, 0, "0 - Transparent; 1 - B2B; 2 - Root Port");
 	SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "dev_type", CTLFLAG_RD,
 	    &ntb->dev_type, 0, "0 - USD; 1 - DSD");
 	SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "ppd", CTLFLAG_RD,
 	    &ntb->ppd, 0, "Raw PPD register (cached)");
 
 	if (ntb->b2b_mw_idx != B2B_MW_DISABLED) {
 		SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "b2b_idx", CTLFLAG_RD,
 		    &ntb->b2b_mw_idx, 0,
 		    "Index of the MW used for B2B remote register access");
 		SYSCTL_ADD_UQUAD(ctx, tree_par, OID_AUTO, "b2b_off",
 		    CTLFLAG_RD, &ntb->b2b_off,
 		    "If non-zero, offset of B2B register region in shared MW");
 	}
 
 	SYSCTL_ADD_PROC(ctx, tree_par, OID_AUTO, "features",
 	    CTLFLAG_RD | CTLTYPE_STRING, ntb, 0, sysctl_handle_features, "A",
 	    "Features/errata of this NTB device");
 
 	SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "ntb_ctl", CTLFLAG_RD,
 	    __DEVOLATILE(uint32_t *, &ntb->ntb_ctl), 0,
 	    "NTB CTL register (cached)");
 	SYSCTL_ADD_UINT(ctx, tree_par, OID_AUTO, "lnk_sta", CTLFLAG_RD,
 	    __DEVOLATILE(uint32_t *, &ntb->lnk_sta), 0,
 	    "LNK STA register (cached)");
 
 	SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "mw_count", CTLFLAG_RD,
 	    &ntb->mw_count, 0, "MW count");
 	SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "spad_count", CTLFLAG_RD,
 	    &ntb->spad_count, 0, "Scratchpad count");
 	SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "db_count", CTLFLAG_RD,
 	    &ntb->db_count, 0, "Doorbell count");
 	SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "db_vec_count", CTLFLAG_RD,
 	    &ntb->db_vec_count, 0, "Doorbell vector count");
 	SYSCTL_ADD_U8(ctx, tree_par, OID_AUTO, "db_vec_shift", CTLFLAG_RD,
 	    &ntb->db_vec_shift, 0, "Doorbell vector shift");
 
 	SYSCTL_ADD_UQUAD(ctx, tree_par, OID_AUTO, "db_valid_mask", CTLFLAG_RD,
 	    &ntb->db_valid_mask, "Doorbell valid mask");
 	SYSCTL_ADD_UQUAD(ctx, tree_par, OID_AUTO, "db_link_mask", CTLFLAG_RD,
 	    &ntb->db_link_mask, "Doorbell link mask");
 	SYSCTL_ADD_UQUAD(ctx, tree_par, OID_AUTO, "db_mask", CTLFLAG_RD,
 	    &ntb->db_mask, "Doorbell mask (cached)");
 
 	/* debug_info.registers: live register reads common to all types. */
 	tmptree = SYSCTL_ADD_NODE(ctx, tree_par, OID_AUTO, "registers",
 	    CTLFLAG_RD, NULL, "Raw HW registers (big-endian)");
 	regpar = SYSCTL_CHILDREN(tmptree);
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "ntbcntl",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 |
 	    ntb->reg->ntb_ctl, sysctl_handle_register, "IU",
 	    "NTB Control register");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "lnkcap",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 |
 	    0x19c, sysctl_handle_register, "IU",
 	    "NTB Link Capabilities");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "lnkcon",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb, NTB_REG_32 |
 	    0x1a0, sysctl_handle_register, "IU",
 	    "NTB Link Control register");
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "db_mask",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | NTB_DB_READ | ntb->self_reg->db_mask,
 	    sysctl_handle_register, "QU", "Doorbell mask register");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "db_bell",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | NTB_DB_READ | ntb->self_reg->db_bell,
 	    sysctl_handle_register, "QU", "Doorbell register");
 
 	/* Incoming translation: one 64-bit BAR45 or split 32-bit BAR4/BAR5. */
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat23",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | ntb->xlat_reg->bar2_xlat,
 	    sysctl_handle_register, "QU", "Incoming XLAT23 register");
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat4",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar4_xlat,
 		    sysctl_handle_register, "IU", "Incoming XLAT4 register");
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat5",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar5_xlat,
 		    sysctl_handle_register, "IU", "Incoming XLAT5 register");
 	} else {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_xlat45",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_64 | ntb->xlat_reg->bar4_xlat,
 		    sysctl_handle_register, "QU", "Incoming XLAT45 register");
 	}
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt23",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | ntb->xlat_reg->bar2_limit,
 	    sysctl_handle_register, "QU", "Incoming LMT23 register");
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt4",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar4_limit,
 		    sysctl_handle_register, "IU", "Incoming LMT4 register");
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt5",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar5_limit,
 		    sysctl_handle_register, "IU", "Incoming LMT5 register");
 	} else {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "incoming_lmt45",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_64 | ntb->xlat_reg->bar4_limit,
 		    sysctl_handle_register, "QU", "Incoming LMT45 register");
 	}
 
 	/* Everything below uses Xeon register offsets. */
 	if (ntb->type == NTB_ATOM)
 		return;
 
 	tmptree = SYSCTL_ADD_NODE(ctx, regpar, OID_AUTO, "xeon_stats",
 	    CTLFLAG_RD, NULL, "Xeon HW statistics");
 	statpar = SYSCTL_CHILDREN(tmptree);
 	SYSCTL_ADD_PROC(ctx, statpar, OID_AUTO, "upstream_mem_miss",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_16 | XEON_USMEMMISS_OFFSET,
 	    sysctl_handle_register, "SU", "Upstream Memory Miss");
 
 	tmptree = SYSCTL_ADD_NODE(ctx, regpar, OID_AUTO, "xeon_hw_err",
 	    CTLFLAG_RD, NULL, "Xeon HW errors");
 	errpar = SYSCTL_CHILDREN(tmptree);
 
 	/* NTB_PCI_REG leaves are read from PCI config space. */
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "ppd",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | NTB_PPD_OFFSET,
 	    sysctl_handle_register, "CU", "PPD");
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "pbar23_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_PBAR23SZ_OFFSET,
 	    sysctl_handle_register, "CU", "PBAR23 SZ (log2)");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "pbar4_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_PBAR4SZ_OFFSET,
 	    sysctl_handle_register, "CU", "PBAR4 SZ (log2)");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "pbar5_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_PBAR5SZ_OFFSET,
 	    sysctl_handle_register, "CU", "PBAR5 SZ (log2)");
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar23_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_SBAR23SZ_OFFSET,
 	    sysctl_handle_register, "CU", "SBAR23 SZ (log2)");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar4_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_SBAR4SZ_OFFSET,
 	    sysctl_handle_register, "CU", "SBAR4 SZ (log2)");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar5_sz",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_8 | NTB_PCI_REG | XEON_SBAR5SZ_OFFSET,
 	    sysctl_handle_register, "CU", "SBAR5 SZ (log2)");
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "devsts",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_16 | NTB_PCI_REG | XEON_DEVSTS_OFFSET,
 	    sysctl_handle_register, "SU", "DEVSTS");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "lnksts",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_16 | NTB_PCI_REG | XEON_LINK_STATUS_OFFSET,
 	    sysctl_handle_register, "SU", "LNKSTS");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "slnksts",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_16 | NTB_PCI_REG | XEON_SLINK_STATUS_OFFSET,
 	    sysctl_handle_register, "SU", "SLNKSTS");
 
 	SYSCTL_ADD_PROC(ctx, errpar, OID_AUTO, "uncerrsts",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_32 | NTB_PCI_REG | XEON_UNCERRSTS_OFFSET,
 	    sysctl_handle_register, "IU", "UNCERRSTS");
 	SYSCTL_ADD_PROC(ctx, errpar, OID_AUTO, "corerrsts",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_32 | NTB_PCI_REG | XEON_CORERRSTS_OFFSET,
 	    sysctl_handle_register, "IU", "CORERRSTS");
 
 	/* Outgoing translation and secondary BAR bases exist only for B2B. */
 	if (ntb->conn_type != NTB_CONN_B2B)
 		return;
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat23",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | ntb->bar_info[NTB_B2B_BAR_1].pbarxlat_off,
 	    sysctl_handle_register, "QU", "Outgoing XLAT23 register");
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat4",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off,
 		    sysctl_handle_register, "IU", "Outgoing XLAT4 register");
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat5",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->bar_info[NTB_B2B_BAR_3].pbarxlat_off,
 		    sysctl_handle_register, "IU", "Outgoing XLAT5 register");
 	} else {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_xlat45",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_64 | ntb->bar_info[NTB_B2B_BAR_2].pbarxlat_off,
 		    sysctl_handle_register, "QU", "Outgoing XLAT45 register");
 	}
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt23",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | XEON_PBAR2LMT_OFFSET,
 	    sysctl_handle_register, "QU", "Outgoing LMT23 register");
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt4",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | XEON_PBAR4LMT_OFFSET,
 		    sysctl_handle_register, "IU", "Outgoing LMT4 register");
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt5",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | XEON_PBAR5LMT_OFFSET,
 		    sysctl_handle_register, "IU", "Outgoing LMT5 register");
 	} else {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "outgoing_lmt45",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_64 | XEON_PBAR4LMT_OFFSET,
 		    sysctl_handle_register, "QU", "Outgoing LMT45 register");
 	}
 
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar01_base",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | ntb->xlat_reg->bar0_base,
 	    sysctl_handle_register, "QU", "Secondary BAR01 base register");
 	SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar23_base",
 	    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 	    NTB_REG_64 | ntb->xlat_reg->bar2_base,
 	    sysctl_handle_register, "QU", "Secondary BAR23 base register");
 	if (HAS_FEATURE(ntb, NTB_SPLIT_BAR)) {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar4_base",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar4_base,
 		    sysctl_handle_register, "IU",
 		    "Secondary BAR4 base register");
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar5_base",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_32 | ntb->xlat_reg->bar5_base,
 		    sysctl_handle_register, "IU",
 		    "Secondary BAR5 base register");
 	} else {
 		SYSCTL_ADD_PROC(ctx, regpar, OID_AUTO, "sbar45_base",
 		    CTLFLAG_RD | CTLTYPE_OPAQUE, ntb,
 		    NTB_REG_64 | ntb->xlat_reg->bar4_base,
 		    sysctl_handle_register, "QU",
 		    "Secondary BAR45 base register");
 	}
 }
 
 static int
 sysctl_handle_features(SYSCTL_HANDLER_ARGS)
 {
 	struct ntb_softc *ntb = arg1;
 	struct sbuf sb;
 	int error;
 
 	sbuf_new_for_sysctl(&sb, NULL, 256, req);
 
 	sbuf_printf(&sb, "%b", ntb->features, NTB_FEATURES_STR);
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	if (error || !req->newptr)
 		return (error);
 	return (EINVAL);
 }
 
 /*
  * sysctl handler for "admin_up": reports whether the link is
  * administratively enabled and, on write, enables (non-zero) or disables
  * (zero) it.
  */
 static int
 sysctl_handle_link_admin(SYSCTL_HANDLER_ARGS)
 {
 	struct ntb_softc *ntb;
 	unsigned cur_state, req_state;
 	int rc;
 
 	ntb = arg1;
 	cur_state = intel_ntb_link_enabled(ntb->device);
 
 	rc = SYSCTL_OUT(req, &cur_state, sizeof(cur_state));
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	rc = SYSCTL_IN(req, &req_state, sizeof(req_state));
 	if (rc != 0)
 		return (rc);
 
 	intel_ntb_printf(0, "Admin set interface state to '%sabled'\n",
 	    (req_state != 0)? "en" : "dis");
 
 	if (req_state == 0)
 		return (intel_ntb_link_disable(ntb->device));
 	return (intel_ntb_link_enable(ntb->device, NTB_SPEED_AUTO,
 	    NTB_WIDTH_AUTO));
 }
 
 static int
 sysctl_handle_link_status_human(SYSCTL_HANDLER_ARGS)
 {
 	struct ntb_softc *ntb = arg1;
 	struct sbuf sb;
 	enum ntb_speed speed;
 	enum ntb_width width;
 	int error;
 
 	sbuf_new_for_sysctl(&sb, NULL, 32, req);
 
 	if (intel_ntb_link_is_up(ntb->device, &speed, &width))
 		sbuf_printf(&sb, "up / PCIe Gen %u / Width x%u",
 		    (unsigned)speed, (unsigned)width);
 	else
 		sbuf_printf(&sb, "down");
 
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	if (error || !req->newptr)
 		return (error);
 	return (EINVAL);
 }
 
 static int
 sysctl_handle_link_status(SYSCTL_HANDLER_ARGS)
 {
 	struct ntb_softc *ntb = arg1;
 	unsigned res;
 	int error;
 
 	res = intel_ntb_link_is_up(ntb->device, NULL, NULL);
 
 	error = SYSCTL_OUT(req, &res, sizeof(res));
 	if (error || !req->newptr)
 		return (error);
 	return (EINVAL);
 }
 
 /*
  * sysctl handler that reads one hardware register on demand and returns it
  * as raw big-endian bytes.
  *
  * arg1 is the softc.  arg2 carries the register offset together with the
  * NTB_REG_* width selector and the NTB_DB_READ / NTB_PCI_REG flags:
  *   - NTB_DB_READ: read through db_ioread(); width must be NTB_REG_64.
  *   - NTB_PCI_REG: read from PCI config space via pci_read_config().
  *   - neither:     read through intel_ntb_reg_read().
  * NTB_DB_READ and NTB_PCI_REG are mutually exclusive (asserted).
  *
  * Read-only; a write request is answered with EINVAL once the value has
  * been copied out.
  */
 static int
 sysctl_handle_register(SYSCTL_HANDLER_ARGS)
 {
 	struct ntb_softc *ntb;
 	const void *outp;
 	uintptr_t sz;
 	uint64_t umv;
 	char be[sizeof(umv)];
 	size_t outsz;
 	uint32_t reg;
 	bool db, pci;
 	int error;
 
 	/* Split arg2 into the register offset and its flag bits. */
 	ntb = arg1;
 	reg = arg2 & ~NTB_REGFLAGS_MASK;
 	sz = arg2 & NTB_REGSZ_MASK;
 	db = (arg2 & NTB_DB_READ) != 0;
 	pci = (arg2 & NTB_PCI_REG) != 0;
 
 	KASSERT(!(db && pci), ("bogus"));
 
 	if (db) {
 		KASSERT(sz == NTB_REG_64, ("bogus"));
 		umv = db_ioread(ntb, reg);
 		outsz = sizeof(uint64_t);
 	} else {
 		/*
 		 * The width passed to intel_ntb_reg_read() is spelled out as
 		 * a literal in every case rather than computed once.
 		 * NOTE(review): presumably because the macro needs a literal
 		 * width -- confirm before collapsing the cases.
 		 */
 		switch (sz) {
 		case NTB_REG_64:
 			if (pci)
 				umv = pci_read_config(ntb->device, reg, 8);
 			else
 				umv = intel_ntb_reg_read(8, reg);
 			outsz = sizeof(uint64_t);
 			break;
 		case NTB_REG_32:
 			if (pci)
 				umv = pci_read_config(ntb->device, reg, 4);
 			else
 				umv = intel_ntb_reg_read(4, reg);
 			outsz = sizeof(uint32_t);
 			break;
 		case NTB_REG_16:
 			if (pci)
 				umv = pci_read_config(ntb->device, reg, 2);
 			else
 				umv = intel_ntb_reg_read(2, reg);
 			outsz = sizeof(uint16_t);
 			break;
 		case NTB_REG_8:
 			if (pci)
 				umv = pci_read_config(ntb->device, reg, 1);
 			else
 				umv = intel_ntb_reg_read(1, reg);
 			outsz = sizeof(uint8_t);
 			break;
 		default:
 			panic("bogus");
 			break;
 		}
 	}
 
 	/* Encode bigendian so that sysctl -x is legible. */
 	be64enc(be, umv);
 	/* Export only the trailing outsz bytes of the 8-byte encoding. */
 	outp = ((char *)be) + sizeof(umv) - outsz;
 
 	error = SYSCTL_OUT(req, outp, outsz);
 	if (error || !req->newptr)
 		return (error);
 	return (EINVAL);
 }
 
 /*
  * Translate a user-visible memory window index into the driver's internal
  * index by stepping over windows that are hidden from users: the B2B
  * window (when it is not shared, i.e. b2b_off == 0) and the MSI-X window.
  */
 static unsigned
 intel_ntb_user_mw_to_idx(struct ntb_softc *ntb, unsigned uidx)
 {
 	bool past_b2b, past_msix;
 
 	/* First pass: skip one slot if we reached either reserved window. */
 	past_b2b = ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
 	    uidx >= ntb->b2b_mw_idx;
 	past_msix = ntb->msix_mw_idx != B2B_MW_DISABLED &&
 	    uidx >= ntb->msix_mw_idx;
 	if (past_b2b || past_msix)
 		uidx++;
 
 	/* Second pass on the adjusted index: skip again if past both. */
 	past_b2b = ntb->b2b_mw_idx != B2B_MW_DISABLED && ntb->b2b_off == 0 &&
 	    uidx >= ntb->b2b_mw_idx;
 	past_msix = ntb->msix_mw_idx != B2B_MW_DISABLED &&
 	    uidx >= ntb->msix_mw_idx;
 	if (past_b2b && past_msix)
 		uidx++;
 
 	return (uidx);
 }
 
 /*
  * Timer handler (armed on peer_msix_work) that trades MSI-X data/offset
  * pairs with the peer through the scratchpad registers.
  *
  * 'ctx' is the struct ntb_softc.  Progress is tracked in peer_msix_done and
  * peer_msix_good:
  *   - neither set: publish the local MSI-X info plus a guard value to the
  *     peer's scratchpad; once the peer's guard shows up in the local
  *     scratchpad, read the peer's values and set peer_msix_done.
  *   - peer_msix_done: write NTB_MSIX_RECEIVED to the peer; once the same
  *     value shows up locally, clear the local scratchpad, set
  *     peer_msix_good and reschedule once more.
  *   - peer_msix_good: poll the link, raise a link event and stop.
  * Whenever a step is incomplete the handler re-arms itself, provided the
  * freshly read link status still says the link is up; otherwise the local
  * scratchpad is cleared and no timer is armed.
  */
 static void
 intel_ntb_exchange_msix(void *ctx)
 {
 	struct ntb_softc *ntb;
 	uint32_t val;
 	unsigned i;
 
 	ntb = ctx;
 
 	if (ntb->peer_msix_good)
 		goto msix_good;
 	if (ntb->peer_msix_done)
 		goto msix_done;
 
 	/* Publish our MSI-X data and offsets (relative to msix_xlat). */
 	intel_ntb_get_msix_info(ntb);
 	for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
 		intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DATA0 + i,
 		    ntb->msix_data[i].nmd_data);
 		intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_OFS0 + i,
 		    ntb->msix_data[i].nmd_ofs - ntb->msix_xlat);
 	}
 	intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_GUARD, NTB_MSIX_VER_GUARD);
 
 	/* The peer has not published its side yet: try again later. */
 	intel_ntb_spad_read(ntb->device, NTB_MSIX_GUARD, &val);
 	if (val != NTB_MSIX_VER_GUARD)
 		goto reschedule;
 
 	for (i = 0; i < XEON_NONLINK_DB_MSIX_BITS; i++) {
 		intel_ntb_spad_read(ntb->device, NTB_MSIX_DATA0 + i, &val);
 		intel_ntb_printf(2, "remote MSIX data(%u): 0x%x\n", i, val);
 		ntb->peer_msix_data[i].nmd_data = val;
 		intel_ntb_spad_read(ntb->device, NTB_MSIX_OFS0 + i, &val);
 		intel_ntb_printf(2, "remote MSIX addr(%u): 0x%x\n", i, val);
 		ntb->peer_msix_data[i].nmd_ofs = val;
 	}
 
 	ntb->peer_msix_done = true;
 
 msix_done:
 	/* Acknowledge receipt and wait for the peer's acknowledgement. */
 	intel_ntb_peer_spad_write(ntb->device, NTB_MSIX_DONE, NTB_MSIX_RECEIVED);
 	intel_ntb_spad_read(ntb->device, NTB_MSIX_DONE, &val);
 	if (val != NTB_MSIX_RECEIVED)
 		goto reschedule;
 
+	intel_ntb_spad_clear(ntb->device);
 	ntb->peer_msix_good = true;
 	/* Give peer time to see our NTB_MSIX_RECEIVED. */
 	goto reschedule;
 
 msix_good:
 	intel_ntb_poll_link(ntb);
 	ntb_link_event(ntb->device);
 	return;
 
 reschedule:
 	/* Refresh the cached link status before deciding whether to retry. */
 	ntb->lnk_sta = pci_read_config(ntb->device, ntb->reg->lnk_sta, 2);
 	if (_xeon_link_is_up(ntb)) {
 		/* hz / 100 ticks normally, twice that once the exchange is good. */
 		callout_reset(&ntb->peer_msix_work,
 		    hz * (ntb->peer_msix_good ? 2 : 1) / 100,
 		    intel_ntb_exchange_msix, ntb);
 	} else
 		intel_ntb_spad_clear(ntb->device);
 }
 
 /*
  * Public API to the rest of the OS
  */
 
 /* Number of scratchpad registers available on this device. */
 static uint8_t
 intel_ntb_spad_count(device_t dev)
 {
 	struct ntb_softc *sc;
 
 	sc = device_get_softc(dev);
 	return (sc->spad_count);
 }
 
 /*
  * Number of memory windows available to users: the hardware count minus
  * the windows the driver keeps for itself (a non-shared B2B window and the
  * MSI-X window, when present).
  */
 static uint8_t
 intel_ntb_mw_count(device_t dev)
 {
 	struct ntb_softc *sc;
 	uint8_t reserved;
 
 	sc = device_get_softc(dev);
 
 	reserved = 0;
 	if (sc->b2b_mw_idx != B2B_MW_DISABLED && sc->b2b_off == 0)
 		reserved++;
 	if (sc->msix_mw_idx != B2B_MW_DISABLED)
 		reserved++;
 	return (sc->mw_count - reserved);
 }
 
 /*
  * Write 'val' to local scratchpad register 'idx'.
  * Returns 0, or EINVAL when idx is out of range.
  */
 static int
 intel_ntb_spad_write(device_t dev, unsigned int idx, uint32_t val)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 	if (idx < ntb->spad_count) {
 		/* Scratchpads are consecutive 4-byte registers. */
 		intel_ntb_reg_write(4, ntb->self_reg->spad + idx * 4, val);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Writes zero to every local scratchpad register.
  */
 static void
 intel_ntb_spad_clear(device_t dev)
 {
 	struct ntb_softc *sc;
 	unsigned idx;
 
 	sc = device_get_softc(dev);
 
 	idx = 0;
 	while (idx < sc->spad_count) {
 		intel_ntb_spad_write(dev, idx, 0);
 		idx++;
 	}
 }
 
 /*
  * Read local scratchpad register 'idx' into *val.
  * Returns 0, or EINVAL (leaving *val untouched) when idx is out of range.
  */
 static int
 intel_ntb_spad_read(device_t dev, unsigned int idx, uint32_t *val)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 	if (idx < ntb->spad_count) {
 		/* Scratchpads are consecutive 4-byte registers. */
 		*val = intel_ntb_reg_read(4, ntb->self_reg->spad + idx * 4);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Write 'val' to the peer's scratchpad register 'idx'.
  *
  * With the NTB_SDOORBELL_LOCKUP feature the write goes through
  * intel_ntb_mw_write() at XEON_SPAD_OFFSET; otherwise through the peer
  * register block.  Returns 0, or EINVAL when idx is out of range.
  */
 static int
 intel_ntb_peer_spad_write(device_t dev, unsigned int idx, uint32_t val)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 	if (idx >= ntb->spad_count)
 		return (EINVAL);
 
 	if (!HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
 		intel_ntb_reg_write(4, ntb->peer_reg->spad + idx * 4, val);
 	} else {
 		intel_ntb_mw_write(4, XEON_SPAD_OFFSET + idx * 4, val);
 	}
 	return (0);
 }
 
 /*
  * Read the peer's scratchpad register 'idx' into *val.
  *
  * With the NTB_SDOORBELL_LOCKUP feature the read goes through
  * intel_ntb_mw_read() at XEON_SPAD_OFFSET; otherwise through the peer
  * register block.  Returns 0, or EINVAL when idx is out of range.
  */
 static int
 intel_ntb_peer_spad_read(device_t dev, unsigned int idx, uint32_t *val)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 	if (idx >= ntb->spad_count)
 		return (EINVAL);
 
 	if (!HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
 		*val = intel_ntb_reg_read(4, ntb->peer_reg->spad + idx * 4);
 	} else {
 		*val = intel_ntb_mw_read(4, XEON_SPAD_OFFSET + idx * 4);
 	}
 	return (0);
 }
 
 /*
  * Describe a user-visible memory window.
  *
  * Every output pointer is optional (NULL is skipped):
  *   base/vbase  physical / virtual start of the usable window,
  *   size        usable size,
  *   align       required alignment (the full BAR size),
  *   align_size  required size granularity (always 1),
  *   plimit      highest bus address allowed for this BAR (32- or 64-bit).
  * When the window shares its BAR with the B2B register region, the start
  * and size are adjusted by b2b_off.
  *
  * Returns 0, or EINVAL when mw_idx is out of range.
  */
 static int
 intel_ntb_mw_get_range(device_t dev, unsigned mw_idx, vm_paddr_t *base,
     caddr_t *vbase, size_t *size, size_t *align, size_t *align_size,
     bus_addr_t *plimit)
 {
 	struct ntb_softc *ntb;
 	struct ntb_pci_bar_info *bar;
 	bus_addr_t max_addr;
 	size_t skip;
 	enum ntb_bar bar_num;
 
 	ntb = device_get_softc(dev);
 	if (mw_idx >= intel_ntb_mw_count(dev))
 		return (EINVAL);
 	mw_idx = intel_ntb_user_mw_to_idx(ntb, mw_idx);
 
 	bar_num = intel_ntb_mw_to_bar(ntb, mw_idx);
 	bar = &ntb->bar_info[bar_num];
 
 	/* Bytes at the start of the BAR that are not handed to the user. */
 	skip = 0;
 	if (mw_idx == ntb->b2b_mw_idx) {
 		KASSERT(ntb->b2b_off != 0,
 		    ("user shouldn't get non-shared b2b mw"));
 		skip = ntb->b2b_off;
 	}
 
 	max_addr = bar_is_64bit(ntb, bar_num) ? BUS_SPACE_MAXADDR :
 	    BUS_SPACE_MAXADDR_32BIT;
 
 	if (base != NULL)
 		*base = bar->pbase + skip;
 	if (vbase != NULL)
 		*vbase = bar->vbase + skip;
 	if (size != NULL)
 		*size = bar->size - skip;
 	if (align != NULL)
 		*align = bar->size;
 	if (align_size != NULL)
 		*align_size = 1;
 	if (plimit != NULL)
 		*plimit = max_addr;
 	return (0);
 }
 
 /*
  * Program the translation address (and limit) of a user-visible memory
  * window.
  *
  * idx   user-visible window index
  * addr  bus address the window should translate to; must be aligned to the
  *       BAR size
  * size  number of bytes to expose; must not exceed the usable window size
  *
  * Each register is written and then read back for verification; on a
  * mismatch the registers written so far are rolled back.
  *
  * Returns 0 on success, EINVAL for a bad index, misaligned address or
  * oversized request, ERANGE when a 32-bit BAR is given an address range
  * that does not fit in 32 bits, and EIO when a read-back does not match.
  */
 static int
 intel_ntb_mw_set_trans(device_t dev, unsigned idx, bus_addr_t addr, size_t size)
 {
 	struct ntb_softc *ntb = device_get_softc(dev);
 	struct ntb_pci_bar_info *bar;
 	uint64_t base, limit, reg_val;
 	size_t bar_size, mw_size;
 	uint32_t base_reg, xlat_reg, limit_reg;
 	enum ntb_bar bar_num;
 
 	if (idx >= intel_ntb_mw_count(dev))
 		return (EINVAL);
 	idx = intel_ntb_user_mw_to_idx(ntb, idx);
 
 	bar_num = intel_ntb_mw_to_bar(ntb, idx);
 	bar = &ntb->bar_info[bar_num];
 
 	/* A window sharing its BAR with the B2B region loses b2b_off bytes. */
 	bar_size = bar->size;
 	if (idx == ntb->b2b_mw_idx)
 		mw_size = bar_size - ntb->b2b_off;
 	else
 		mw_size = bar_size;
 
 	/* Hardware requires that addr is aligned to bar size */
 	/* (the mask test assumes bar_size is a power of two -- TODO confirm) */
 	if ((addr & (bar_size - 1)) != 0)
 		return (EINVAL);
 
 	if (size > mw_size)
 		return (EINVAL);
 
 	bar_get_xlat_params(ntb, bar_num, &base_reg, &xlat_reg, &limit_reg);
 
 	/*
 	 * A limit of 0 is written unless a limit register exists and only
 	 * part of the window is requested.
 	 * NOTE(review): limit_reg is written below even when it is 0; confirm
 	 * that bar_get_xlat_params() never reports a missing limit register
 	 * for a BAR that can reach this point.
 	 */
 	limit = 0;
 	if (bar_is_64bit(ntb, bar_num)) {
 		base = intel_ntb_reg_read(8, base_reg) & BAR_HIGH_MASK;
 
 		if (limit_reg != 0 && size != mw_size)
 			limit = base + size;
 
 		/* Set and verify translation address */
 		intel_ntb_reg_write(8, xlat_reg, addr);
 		reg_val = intel_ntb_reg_read(8, xlat_reg) & BAR_HIGH_MASK;
 		if (reg_val != addr) {
 			intel_ntb_reg_write(8, xlat_reg, 0);
 			return (EIO);
 		}
 
 		/* Set and verify the limit */
 		intel_ntb_reg_write(8, limit_reg, limit);
 		reg_val = intel_ntb_reg_read(8, limit_reg) & BAR_HIGH_MASK;
 		if (reg_val != limit) {
 			/* Roll back: limit to the BAR base, translation to 0. */
 			intel_ntb_reg_write(8, limit_reg, base);
 			intel_ntb_reg_write(8, xlat_reg, 0);
 			return (EIO);
 		}
 	} else {
 		/* Configure 32-bit (split) BAR MW */
 
 		/* Both ends of the range must be representable in 32 bits. */
 		if ((addr & UINT32_MAX) != addr)
 			return (ERANGE);
 		if (((addr + size) & UINT32_MAX) != (addr + size))
 			return (ERANGE);
 
 		base = intel_ntb_reg_read(4, base_reg) & BAR_HIGH_MASK;
 
 		if (limit_reg != 0 && size != mw_size)
 			limit = base + size;
 
 		/* Set and verify translation address */
 		intel_ntb_reg_write(4, xlat_reg, addr);
 		reg_val = intel_ntb_reg_read(4, xlat_reg) & BAR_HIGH_MASK;
 		if (reg_val != addr) {
 			intel_ntb_reg_write(4, xlat_reg, 0);
 			return (EIO);
 		}
 
 		/* Set and verify the limit */
 		intel_ntb_reg_write(4, limit_reg, limit);
 		reg_val = intel_ntb_reg_read(4, limit_reg) & BAR_HIGH_MASK;
 		if (reg_val != limit) {
 			/* Roll back: limit to the BAR base, translation to 0. */
 			intel_ntb_reg_write(4, limit_reg, base);
 			intel_ntb_reg_write(4, xlat_reg, 0);
 			return (EIO);
 		}
 	}
 	return (0);
 }
 
 /*
  * Remove the translation of a memory window by programming a zero address
  * and a zero size.  Returns whatever intel_ntb_mw_set_trans() returns.
  */
 static int
 intel_ntb_mw_clear_trans(device_t dev, unsigned mw_idx)
 {
 	int rc;
 
 	rc = intel_ntb_mw_set_trans(dev, mw_idx, 0, 0);
 	return (rc);
 }
 
 /*
  * Fetch the memory attribute currently recorded for a user-visible memory
  * window.  Returns 0, or EINVAL when idx is out of range.
  */
 static int
 intel_ntb_mw_get_wc(device_t dev, unsigned idx, vm_memattr_t *mode)
 {
 	struct ntb_softc *sc;
 	enum ntb_bar bar_num;
 
 	sc = device_get_softc(dev);
 	if (idx >= intel_ntb_mw_count(dev))
 		return (EINVAL);
 
 	bar_num = intel_ntb_mw_to_bar(sc, intel_ntb_user_mw_to_idx(sc, idx));
 	*mode = sc->bar_info[bar_num].map_mode;
 	return (0);
 }
 
 /*
  * Change the memory attribute of a user-visible memory window.
  * Returns EINVAL when idx is out of range, otherwise the result of
  * intel_ntb_mw_set_wc_internal().
  */
 static int
 intel_ntb_mw_set_wc(device_t dev, unsigned idx, vm_memattr_t mode)
 {
 	struct ntb_softc *sc;
 	unsigned hw_idx;
 
 	sc = device_get_softc(dev);
 	if (idx >= intel_ntb_mw_count(dev))
 		return (EINVAL);
 
 	hw_idx = intel_ntb_user_mw_to_idx(sc, idx);
 	return (intel_ntb_mw_set_wc_internal(sc, hw_idx, mode));
 }
 
 /*
  * Change the memory attribute of the BAR backing internal window 'idx'.
  *
  * Does nothing when the BAR already has the requested mode.  The recorded
  * mode is updated only if pmap_change_attr() succeeds.  Returns 0 or the
  * pmap_change_attr() error.
  */
 static int
 intel_ntb_mw_set_wc_internal(struct ntb_softc *ntb, unsigned idx, vm_memattr_t mode)
 {
 	struct ntb_pci_bar_info *bar;
 	int error;
 
 	bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, idx)];
 	if (bar->map_mode != mode) {
 		error = pmap_change_attr((vm_offset_t)bar->vbase, bar->size,
 		    mode);
 		if (error != 0)
 			return (error);
 		bar->map_mode = mode;
 	}
 	return (0);
 }
 
 /*
  * Ring the peer's doorbell(s) selected by 'bit'.
  *
  * Three delivery paths, chosen by device features:
  *   NTB_SB01BASE_LOCKUP   for each of the first XEON_NONLINK_DB_MSIX_BITS
  *                         vectors whose mask intersects 'bit', write the
  *                         peer's MSI-X data to its MSI-X offset through
  *                         the peer_lapic_bar mapping;
  *   NTB_SDOORBELL_LOCKUP  write 'bit' through intel_ntb_mw_write() at
  *                         XEON_PDOORBELL_OFFSET;
  *   otherwise             write 'bit' to the peer doorbell register.
  */
 static void
 intel_ntb_peer_db_set(device_t dev, uint64_t bit)
 {
 	struct ntb_softc *ntb;
 	struct ntb_pci_bar_info *lapic;
 	unsigned vec;
 
 	ntb = device_get_softc(dev);
 
 	if (HAS_FEATURE(ntb, NTB_SB01BASE_LOCKUP)) {
 		lapic = ntb->peer_lapic_bar;
 
 		for (vec = 0; vec < XEON_NONLINK_DB_MSIX_BITS; vec++) {
 			if ((bit & intel_ntb_db_vector_mask(dev, vec)) == 0)
 				continue;
 			bus_space_write_4(lapic->pci_bus_tag,
 			    lapic->pci_bus_handle,
 			    ntb->peer_msix_data[vec].nmd_ofs,
 			    ntb->peer_msix_data[vec].nmd_data);
 		}
 	} else if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
 		intel_ntb_mw_write(2, XEON_PDOORBELL_OFFSET, bit);
 	} else {
 		db_iowrite(ntb, ntb->peer_reg->db_bell, bit);
 	}
 }
 
 /*
  * Report the address and size of the peer doorbell register.
  *
  * Both output pointers must be non-NULL (asserted).  With the
  * NTB_SDOORBELL_LOCKUP feature the doorbell lives at XEON_PDOORBELL_OFFSET
  * in the BAR backing the B2B memory window; otherwise it is the peer
  * doorbell register in the config BAR.  Always returns 0.
  */
 static int
 intel_ntb_peer_db_addr(device_t dev, bus_addr_t *db_addr, vm_size_t *db_size)
 {
 	struct ntb_softc *ntb;
 	struct ntb_pci_bar_info *bar;
 	uint64_t regoff;
 
 	ntb = device_get_softc(dev);
 
 	KASSERT((db_addr != NULL && db_size != NULL), ("must be non-NULL"));
 
 	if (HAS_FEATURE(ntb, NTB_SDOORBELL_LOCKUP)) {
 		KASSERT(ntb->b2b_mw_idx != B2B_MW_DISABLED,
 		    ("invalid b2b idx"));
 
 		bar = &ntb->bar_info[intel_ntb_mw_to_bar(ntb, ntb->b2b_mw_idx)];
 		regoff = XEON_PDOORBELL_OFFSET;
 	} else {
 		bar = &ntb->bar_info[NTB_CONFIG_BAR];
 		regoff = ntb->peer_reg->db_bell;
 	}
 	KASSERT(bar->pci_bus_tag != X86_BUS_SPACE_IO, ("uh oh"));
 
 	/* HACK: Specific to current x86 bus implementation. */
 	*db_addr = ((uint64_t)bar->pci_bus_handle + regoff);
 	*db_size = ntb->reg->db_size;
 	return (0);
 }
 
 /* Bitmask of the doorbell bits that are valid on this device. */
 static uint64_t
 intel_ntb_db_valid_mask(device_t dev)
 {
 	struct ntb_softc *sc;
 
 	sc = device_get_softc(dev);
 	return (sc->db_valid_mask);
 }
 
 /* Number of doorbell interrupt vectors on this device. */
 static int
 intel_ntb_db_vector_count(device_t dev)
 {
 	struct ntb_softc *sc;
 
 	sc = device_get_softc(dev);
 	return (sc->db_vec_count);
 }
 
 /*
  * Doorbell bits serviced by interrupt vector 'vector'.
  *
  * Vectors are numbered from 0, so valid values are 0 .. db_vec_count - 1.
  * Returns the vector's bits restricted to db_valid_mask, or 0 for an
  * out-of-range vector.
  */
 static uint64_t
 intel_ntb_db_vector_mask(device_t dev, uint32_t vector)
 {
 	struct ntb_softc *ntb = device_get_softc(dev);
 
 	/*
 	 * Reject vector == db_vec_count as well: it is one past the last
 	 * valid vector and must not reach intel_ntb_vec_mask().
 	 */
 	if (vector >= ntb->db_vec_count)
 		return (0);
 	return (ntb->db_valid_mask & intel_ntb_vec_mask(ntb, vector));
 }
 
 /*
  * Report whether the link is up.  When non-NULL, *speed and *width receive
  * the current link speed and width (the "none" values while the link is
  * down).
  */
 static bool
 intel_ntb_link_is_up(device_t dev, enum ntb_speed *speed, enum ntb_width *width)
 {
 	struct ntb_softc *ntb;
 
 	ntb = device_get_softc(dev);
 
 	if (speed != NULL) {
 		*speed = intel_ntb_link_sta_speed(ntb);
 	}
 	if (width != NULL) {
 		*width = intel_ntb_link_sta_width(ntb);
 	}
 	return (link_is_up(ntb));
 }
 
 /*
  * Cache the properties of the BAR's allocated resource (bar->pci_resource)
  * in the BAR info structure so later code does not have to query rman.
  */
 static void
 save_bar_parameters(struct ntb_pci_bar_info *bar)
 {
 
 	/* Address range. */
 	bar->pbase = rman_get_start(bar->pci_resource);
 	bar->size = rman_get_size(bar->pci_resource);
 	bar->vbase = rman_get_virtual(bar->pci_resource);
 
 	/* bus_space access cookies. */
 	bar->pci_bus_tag = rman_get_bustag(bar->pci_resource);
 	bar->pci_bus_handle = rman_get_bushandle(bar->pci_resource);
 }
 
 /*
  * Method table for the Intel NTB driver: maps the generic device methods
  * and the NTB interface methods onto the intel_ntb_* implementations in
  * this file.
  */
 static device_method_t ntb_intel_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		intel_ntb_probe),
 	DEVMETHOD(device_attach,	intel_ntb_attach),
 	DEVMETHOD(device_detach,	intel_ntb_detach),
 	/* NTB interface */
 	DEVMETHOD(ntb_link_is_up,	intel_ntb_link_is_up),
 	DEVMETHOD(ntb_link_enable,	intel_ntb_link_enable),
 	DEVMETHOD(ntb_link_disable,	intel_ntb_link_disable),
 	DEVMETHOD(ntb_link_enabled,	intel_ntb_link_enabled),
 	DEVMETHOD(ntb_mw_count,		intel_ntb_mw_count),
 	DEVMETHOD(ntb_mw_get_range,	intel_ntb_mw_get_range),
 	DEVMETHOD(ntb_mw_set_trans,	intel_ntb_mw_set_trans),
 	DEVMETHOD(ntb_mw_clear_trans,	intel_ntb_mw_clear_trans),
 	DEVMETHOD(ntb_mw_get_wc,	intel_ntb_mw_get_wc),
 	DEVMETHOD(ntb_mw_set_wc,	intel_ntb_mw_set_wc),
 	DEVMETHOD(ntb_spad_count,	intel_ntb_spad_count),
 	DEVMETHOD(ntb_spad_clear,	intel_ntb_spad_clear),
 	DEVMETHOD(ntb_spad_write,	intel_ntb_spad_write),
 	DEVMETHOD(ntb_spad_read,	intel_ntb_spad_read),
 	DEVMETHOD(ntb_peer_spad_write,	intel_ntb_peer_spad_write),
 	DEVMETHOD(ntb_peer_spad_read,	intel_ntb_peer_spad_read),
 	DEVMETHOD(ntb_db_valid_mask,	intel_ntb_db_valid_mask),
 	DEVMETHOD(ntb_db_vector_count,	intel_ntb_db_vector_count),
 	DEVMETHOD(ntb_db_vector_mask,	intel_ntb_db_vector_mask),
 	DEVMETHOD(ntb_db_clear,		intel_ntb_db_clear),
 	DEVMETHOD(ntb_db_clear_mask,	intel_ntb_db_clear_mask),
 	DEVMETHOD(ntb_db_read,		intel_ntb_db_read),
 	DEVMETHOD(ntb_db_set_mask,	intel_ntb_db_set_mask),
 	DEVMETHOD(ntb_peer_db_addr,	intel_ntb_peer_db_addr),
 	DEVMETHOD(ntb_peer_db_set,	intel_ntb_peer_db_set),
 	DEVMETHOD_END
 };
 
 /*
  * Driver class "ntb_hw" with a struct ntb_softc per device, registered as
  * module ntb_intel on the pci bus.  The module depends on version 1 of the
  * "ntb" module and itself exports version 1.
  */
 static DEFINE_CLASS_0(ntb_hw, ntb_intel_driver, ntb_intel_methods,
     sizeof(struct ntb_softc));
 DRIVER_MODULE(ntb_intel, pci, ntb_intel_driver, ntb_hw_devclass, NULL, NULL);
 MODULE_DEPEND(ntb_intel, ntb, 1, 1, 1);
 MODULE_VERSION(ntb_intel, 1);
Index: user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.c	(revision 303517)
@@ -1,1479 +1,1518 @@
 /*-
  * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
  * Copyright (C) 2013 Intel Corporation
  * Copyright (C) 2015 EMC Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The Non-Transparent Bridge (NTB) is a device that allows you to connect
  * two or more systems using a PCI-e links, providing remote memory access.
  *
  * This module contains a transport for sending and receiving messages by
  * writing to remote memory window(s) provided by underlying NTB device.
  *
  * NOTE: Much of the code in this module is shared with Linux. Any patches may
  * be picked up and redistributed in Linux with a dual GPL/BSD license.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
-#include <sys/bitset.h>
 #include <sys/bus.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 
 #include "ntb.h"
 #include "ntb_transport.h"
 
-#define QP_SETSIZE	64
-BITSET_DEFINE(_qpset, QP_SETSIZE);
-#define test_bit(pos, addr)	BIT_ISSET(QP_SETSIZE, (pos), (addr))
-#define set_bit(pos, addr)	BIT_SET(QP_SETSIZE, (pos), (addr))
-#define clear_bit(pos, addr)	BIT_CLR(QP_SETSIZE, (pos), (addr))
-#define ffs_bit(addr)		BIT_FFS(QP_SETSIZE, (addr))
-
 #define KTR_NTB KTR_SPARE3
 
 #define NTB_TRANSPORT_VERSION	4
 
 static SYSCTL_NODE(_hw, OID_AUTO, ntb_transport, CTLFLAG_RW, 0, "ntb_transport");
 
 static unsigned g_ntb_transport_debug_level;
 SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, debug_level, CTLFLAG_RWTUN,
     &g_ntb_transport_debug_level, 0,
     "ntb_transport log level -- higher is more verbose");
 #define ntb_printf(lvl, ...) do {			\
 	if ((lvl) <= g_ntb_transport_debug_level) {	\
 		printf(__VA_ARGS__);			\
 	}						\
 } while (0)
 
 static unsigned transport_mtu = 0x10000;
 
 static uint64_t max_mw_size;
 SYSCTL_UQUAD(_hw_ntb_transport, OID_AUTO, max_mw_size, CTLFLAG_RDTUN, &max_mw_size, 0,
     "If enabled (non-zero), limit the size of large memory windows. "
     "Both sides of the NTB MUST set the same value here.");
 
-static unsigned max_num_clients;
-SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, max_num_clients, CTLFLAG_RDTUN,
-    &max_num_clients, 0, "Maximum number of NTB transport clients.  "
-    "0 (default) - use all available NTB memory windows; "
-    "positive integer N - Limit to N memory windows.");
-
 static unsigned enable_xeon_watchdog;
 SYSCTL_UINT(_hw_ntb_transport, OID_AUTO, enable_xeon_watchdog, CTLFLAG_RDTUN,
     &enable_xeon_watchdog, 0, "If non-zero, write a register every second to "
     "keep a watchdog from tearing down the NTB link");
 
 STAILQ_HEAD(ntb_queue_list, ntb_queue_entry);
 
 typedef uint32_t ntb_q_idx_t;
 
 struct ntb_queue_entry {
 	/* ntb_queue list reference */
 	STAILQ_ENTRY(ntb_queue_entry) entry;
 
 	/* info on data to be transferred */
 	void		*cb_data;
 	void		*buf;
 	uint32_t	len;
 	uint32_t	flags;
 
 	struct ntb_transport_qp		*qp;
 	struct ntb_payload_header	*x_hdr;
 	ntb_q_idx_t	index;
 };
 
 struct ntb_rx_info {
 	ntb_q_idx_t	entry;
 };
 
 struct ntb_transport_qp {
 	struct ntb_transport_ctx	*transport;
 	device_t		 dev;
 
 	void			*cb_data;
 
 	bool			client_ready;
 	volatile bool		link_is_up;
 	uint8_t			qp_num;	/* Only 64 QPs are allowed.  0-63 */
 
 	struct ntb_rx_info	*rx_info;
 	struct ntb_rx_info	*remote_rx_info;
 
 	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
 	    void *data, int len);
 	struct ntb_queue_list	tx_free_q;
 	struct mtx		ntb_tx_free_q_lock;
 	caddr_t			tx_mw;
 	bus_addr_t		tx_mw_phys;
 	ntb_q_idx_t		tx_index;
 	ntb_q_idx_t		tx_max_entry;
 	uint64_t		tx_max_frame;
 
 	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
 	    void *data, int len);
 	struct ntb_queue_list	rx_post_q;
 	struct ntb_queue_list	rx_pend_q;
 	/* ntb_rx_q_lock: synchronize access to rx_XXXX_q */
 	struct mtx		ntb_rx_q_lock;
 	struct task		rxc_db_work;
 	struct taskqueue	*rxc_tq;
 	caddr_t			rx_buff;
 	ntb_q_idx_t		rx_index;
 	ntb_q_idx_t		rx_max_entry;
 	uint64_t		rx_max_frame;
 
 	void (*event_handler)(void *data, enum ntb_link_event status);
 	struct callout		link_work;
 	struct callout		rx_full;
 
 	uint64_t		last_rx_no_buf;
 
 	/* Stats */
 	uint64_t		rx_bytes;
 	uint64_t		rx_pkts;
 	uint64_t		rx_ring_empty;
 	uint64_t		rx_err_no_buf;
 	uint64_t		rx_err_oflow;
 	uint64_t		rx_err_ver;
 	uint64_t		tx_bytes;
 	uint64_t		tx_pkts;
 	uint64_t		tx_ring_full;
 	uint64_t		tx_err_no_buf;
 
 	struct mtx		tx_lock;
 };
 
 struct ntb_transport_mw {
 	vm_paddr_t	phys_addr;
 	size_t		phys_size;
 	size_t		xlat_align;
 	size_t		xlat_align_size;
 	bus_addr_t	addr_limit;
 	/* Tx buff is off vbase / phys_addr */
 	caddr_t		vbase;
 	size_t		xlat_size;
 	size_t		buff_size;
 	/* Rx buff is off virt_addr / dma_addr */
 	caddr_t		virt_addr;
 	bus_addr_t	dma_addr;
 };
 
+struct ntb_transport_child {
+	device_t	dev;		/* child added by device_add_child() */
+	int		qpoff;		/* first queue index in parent's qp_vec */
+	int		qpcnt;		/* number of queues given to this child */
+	struct ntb_transport_child *next;	/* next child in parent's list */
+};
+
 struct ntb_transport_ctx {
 	device_t		 dev;
+	struct ntb_transport_child *child;
 	struct ntb_transport_mw	*mw_vec;
 	struct ntb_transport_qp	*qp_vec;
-	struct _qpset		qp_bitmap;
-	struct _qpset		qp_bitmap_free;
 	unsigned		mw_count;
 	unsigned		qp_count;
+	uint64_t		qp_bitmap;
 	volatile bool		link_is_up;
 	struct callout		link_work;
 	struct callout		link_watchdog;
 	struct task		link_cleanup;
 };
 
 enum {
 	NTBT_DESC_DONE_FLAG = 1 << 0,
 	NTBT_LINK_DOWN_FLAG = 1 << 1,
 };
 
 struct ntb_payload_header {
 	ntb_q_idx_t ver;
 	uint32_t len;
 	uint32_t flags;
 };
 
 enum {
 	/*
 	 * The order of this enum is part of the remote protocol.  Do not
 	 * reorder without bumping protocol version (and it's probably best
 	 * to keep the protocol in lock-step with the Linux NTB driver.
 	 */
 	NTBT_VERSION = 0,
 	NTBT_QP_LINKS,
 	NTBT_NUM_QPS,
 	NTBT_NUM_MWS,
 	/*
 	 * N.B.: transport_link_work assumes MW1 enums = MW0 + 2.
 	 */
 	NTBT_MW0_SZ_HIGH,
 	NTBT_MW0_SZ_LOW,
 	NTBT_MW1_SZ_HIGH,
 	NTBT_MW1_SZ_LOW,
-	NTBT_MAX_SPAD,
 
 	/*
 	 * Some NTB-using hardware have a watchdog to work around NTB hangs; if
 	 * a register or doorbell isn't written every few seconds, the link is
 	 * torn down.  Write an otherwise unused register every few seconds to
 	 * work around this watchdog.
 	 */
 	NTBT_WATCHDOG_SPAD = 15
 };
 
 #define QP_TO_MW(nt, qp)	((qp) % nt->mw_count)
 #define NTB_QP_DEF_NUM_ENTRIES	100
 #define NTB_LINK_DOWN_TIMEOUT	10
 
 static int ntb_transport_probe(device_t dev);
 static int ntb_transport_attach(device_t dev);
 static int ntb_transport_detach(device_t dev);
 static void ntb_transport_init_queue(struct ntb_transport_ctx *nt,
     unsigned int qp_num);
 static int ntb_process_tx(struct ntb_transport_qp *qp,
     struct ntb_queue_entry *entry);
 static void ntb_transport_rxc_db(void *arg, int pending);
 static int ntb_process_rxc(struct ntb_transport_qp *qp);
 static void ntb_memcpy_rx(struct ntb_transport_qp *qp,
     struct ntb_queue_entry *entry, void *offset);
 static inline void ntb_rx_copy_callback(struct ntb_transport_qp *qp,
     void *data);
 static void ntb_complete_rxc(struct ntb_transport_qp *qp);
 static void ntb_transport_doorbell_callback(void *data, uint32_t vector);
 static void ntb_transport_event_callback(void *data);
 static void ntb_transport_link_work(void *arg);
 static int ntb_set_mw(struct ntb_transport_ctx *, int num_mw, size_t size);
 static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw);
 static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
     unsigned int qp_num);
 static void ntb_qp_link_work(void *arg);
 static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt);
 static void ntb_transport_link_cleanup_work(void *, int);
 static void ntb_qp_link_down(struct ntb_transport_qp *qp);
 static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp);
 static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp);
 static void ntb_send_link_down(struct ntb_transport_qp *qp);
 static void ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
     struct ntb_queue_list *list);
 static struct ntb_queue_entry *ntb_list_rm(struct mtx *lock,
     struct ntb_queue_list *list);
 static struct ntb_queue_entry *ntb_list_mv(struct mtx *lock,
     struct ntb_queue_list *from, struct ntb_queue_list *to);
 static void xeon_link_watchdog_hb(void *);
 
 static const struct ntb_ctx_ops ntb_transport_ops = {
 	.link_event = ntb_transport_event_callback,
 	.db_event = ntb_transport_doorbell_callback,
 };
 
 MALLOC_DEFINE(M_NTB_T, "ntb_transport", "ntb transport driver");
 
 static inline void
 iowrite32(uint32_t val, void *addr)
 {
 
 	bus_space_write_4(X86_BUS_SPACE_MEM, 0/* HACK */, (uintptr_t)addr,
 	    val);
 }
 
 /* Transport Init and teardown */
 
 static void
 xeon_link_watchdog_hb(void *arg)
 {
 	struct ntb_transport_ctx *nt;
 
 	nt = arg;
 	ntb_spad_write(nt->dev, NTBT_WATCHDOG_SPAD, 0);
 	callout_reset(&nt->link_watchdog, 1 * hz, xeon_link_watchdog_hb, nt);
 }
 
 static int
 ntb_transport_probe(device_t dev)
 {
 
 	device_set_desc(dev, "NTB Transport");
 	return (0);
 }
 
 static int
 ntb_transport_attach(device_t dev)
 {
 	struct ntb_transport_ctx *nt = device_get_softc(dev);
+	struct ntb_transport_child **cpp = &nt->child;
+	struct ntb_transport_child *nc;
 	struct ntb_transport_mw *mw;
-	uint64_t qp_bitmap;
-	int rc;
-	unsigned i;
+	uint64_t db_bitmap;
+	int rc, i, db_count, spad_count, qp, qpu, qpo, qpt;
+	char cfg[128] = "";
+	char buf[32];
+	char *n, *np, *c, *name;
 
 	nt->dev = dev;
 	nt->mw_count = ntb_mw_count(dev);
+	spad_count = ntb_spad_count(dev);
+	db_bitmap = ntb_db_valid_mask(dev);
+	db_count = flsll(db_bitmap);
+	KASSERT((db_bitmap & (db_bitmap + 1)) == 0,
+	    ("Doorbells are not sequential (%jx).\n", (uintmax_t)db_bitmap));
+
+	device_printf(dev, "%d memory windows, %d scratchpads, "
+	    "%d doorbells\n", nt->mw_count, spad_count, db_count);
+
+	if (nt->mw_count == 0) {
+		device_printf(dev, "At least 1 memory window required.\n");
+		return (ENXIO);
+	}
+	if (spad_count < 6) {
+		device_printf(dev, "At least 6 scratchpads required.\n");
+		return (ENXIO);
+	}
+	if (spad_count < 4 + 2 * nt->mw_count) {
+		nt->mw_count = (spad_count - 4) / 2;
+		device_printf(dev, "Scratchpads enough only for %d "
+		    "memory windows.\n", nt->mw_count);
+	}
+	if (db_bitmap == 0) {
+		device_printf(dev, "At least one doorbell required.\n");
+		return (ENXIO);
+	}
+
 	nt->mw_vec = malloc(nt->mw_count * sizeof(*nt->mw_vec), M_NTB_T,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < nt->mw_count; i++) {
 		mw = &nt->mw_vec[i];
 
 		rc = ntb_mw_get_range(dev, i, &mw->phys_addr, &mw->vbase,
 		    &mw->phys_size, &mw->xlat_align, &mw->xlat_align_size,
 		    &mw->addr_limit);
 		if (rc != 0)
 			goto err;
 
 		mw->buff_size = 0;
 		mw->xlat_size = 0;
 		mw->virt_addr = NULL;
 		mw->dma_addr = 0;
 
 		rc = ntb_mw_set_wc(dev, i, VM_MEMATTR_WRITE_COMBINING);
 		if (rc)
 			ntb_printf(0, "Unable to set mw%d caching\n", i);
 	}
 
-	qp_bitmap = ntb_db_valid_mask(dev);
-	nt->qp_count = flsll(qp_bitmap);
-	KASSERT(nt->qp_count != 0, ("bogus db bitmap"));
-	nt->qp_count -= 1;
+	qpu = 0;
+	qpo = imin(db_count, nt->mw_count);
+	qpt = db_count;
 
-	if (max_num_clients != 0 && max_num_clients < nt->qp_count)
-		nt->qp_count = max_num_clients;
-	else if (nt->mw_count < nt->qp_count)
-		nt->qp_count = nt->mw_count;
-	KASSERT(nt->qp_count <= QP_SETSIZE, ("invalid qp_count"));
+	snprintf(buf, sizeof(buf), "hint.%s.%d.config", device_get_name(dev),
+	    device_get_unit(dev));
+	TUNABLE_STR_FETCH(buf, cfg, sizeof(cfg));
+	n = cfg;
+	i = 0;
+	while ((c = strsep(&n, ",")) != NULL) {
+		np = c;
+		name = strsep(&np, ":");
+		if (name != NULL && name[0] == 0)
+			name = NULL;
+		qp = (np && np[0] != 0) ? strtol(np, NULL, 10) : qpo - qpu;
+		if (qp <= 0)
+			qp = 1;
 
+		if (qp > qpt - qpu) {
+			device_printf(dev, "Not enough resources for config\n");
+			break;
+		}
+		nc = malloc(sizeof(*nc), M_DEVBUF, M_WAITOK | M_ZERO);
+		nc->qpoff = qpu;
+		nc->qpcnt = qp;
+		nc->dev = device_add_child(dev, name, -1);
+		if (nc->dev == NULL) {
+			device_printf(dev, "Can not add child.\n");
+			free(nc, M_DEVBUF);	/* not linked yet; do not leak */
+			break;
+		}
+		device_set_ivars(nc->dev, nc);
+		*cpp = nc;
+		cpp = &nc->next;
+
+		if (bootverbose) {
+			device_printf(dev, "%d \"%s\": queues %d",
+			    i, name, qpu);
+			if (qp > 1)
+				printf("-%d", qpu + qp - 1);
+			printf("\n");
+		}
+
+		qpu += qp;
+		i++;
+	}
+	nt->qp_count = qpu;
+
 	nt->qp_vec = malloc(nt->qp_count * sizeof(*nt->qp_vec), M_NTB_T,
 	    M_WAITOK | M_ZERO);
 
-	for (i = 0; i < nt->qp_count; i++) {
-		set_bit(i, &nt->qp_bitmap);
-		set_bit(i, &nt->qp_bitmap_free);
+	for (i = 0; i < nt->qp_count; i++)
 		ntb_transport_init_queue(nt, i);
-	}
 
 	callout_init(&nt->link_work, 0);
 	callout_init(&nt->link_watchdog, 0);
 	TASK_INIT(&nt->link_cleanup, 0, ntb_transport_link_cleanup_work, nt);
 
 	rc = ntb_set_ctx(dev, nt, &ntb_transport_ops);
 	if (rc != 0)
 		goto err;
 
 	nt->link_is_up = false;
 	ntb_link_enable(dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
 
 	if (enable_xeon_watchdog != 0)
 		callout_reset(&nt->link_watchdog, 0, xeon_link_watchdog_hb, nt);
 
-	/* Attach children to this transport */
-	device_add_child(dev, NULL, -1);
 	bus_generic_attach(dev);
-
 	return (0);
 
 err:
 	free(nt->qp_vec, M_NTB_T);
 	free(nt->mw_vec, M_NTB_T);
 	return (rc);
 }
 
 static int
 ntb_transport_detach(device_t dev)
 {
 	struct ntb_transport_ctx *nt = device_get_softc(dev);
-	struct _qpset qp_bitmap_alloc;
-	uint8_t i;
+	struct ntb_transport_child **cpp = &nt->child;
+	struct ntb_transport_child *nc;
+	int error = 0, i;
 
-	/* Detach & delete all children */
-	device_delete_children(dev);
+	while ((nc = *cpp) != NULL) {
+		error = device_delete_child(dev, nc->dev);
+		if (error)
+			return (error);	/* child stays linked; abort detach */
+		*cpp = nc->next;
+		free(nc, M_DEVBUF);
+	}
+	KASSERT(nt->qp_bitmap == 0, ("Some queues not freed on detach (%jx)",
+	    (uintmax_t)nt->qp_bitmap));
 
 	ntb_transport_link_cleanup(nt);
 	taskqueue_drain(taskqueue_swi, &nt->link_cleanup);
 	callout_drain(&nt->link_work);
 	callout_drain(&nt->link_watchdog);
 
-	BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
-	BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
-	/* Verify that all the QPs are freed */
-	for (i = 0; i < nt->qp_count; i++)
-		if (test_bit(i, &qp_bitmap_alloc))
-			ntb_transport_free_queue(&nt->qp_vec[i]);
-
 	ntb_link_disable(dev);
 	ntb_clear_ctx(dev);
 
 	for (i = 0; i < nt->mw_count; i++)
 		ntb_free_mw(nt, i);
 
 	free(nt->qp_vec, M_NTB_T);
 	free(nt->mw_vec, M_NTB_T);
 	return (0);
 }
 
+int
+ntb_transport_queue_count(device_t dev)
+{
+	struct ntb_transport_child *nc = device_get_ivars(dev);
+
+	return (nc->qpcnt);
+}
+
 static void
 ntb_transport_init_queue(struct ntb_transport_ctx *nt, unsigned int qp_num)
 {
 	struct ntb_transport_mw *mw;
 	struct ntb_transport_qp *qp;
 	vm_paddr_t mw_base;
 	uint64_t mw_size, qp_offset;
 	size_t tx_size;
 	unsigned num_qps_mw, mw_num, mw_count;
 
 	mw_count = nt->mw_count;
 	mw_num = QP_TO_MW(nt, qp_num);
 	mw = &nt->mw_vec[mw_num];
 
 	qp = &nt->qp_vec[qp_num];
 	qp->qp_num = qp_num;
 	qp->transport = nt;
 	qp->dev = nt->dev;
 	qp->client_ready = false;
 	qp->event_handler = NULL;
 	ntb_qp_link_down_reset(qp);
 
 	if (mw_num < nt->qp_count % mw_count)
 		num_qps_mw = nt->qp_count / mw_count + 1;
 	else
 		num_qps_mw = nt->qp_count / mw_count;
 
 	mw_base = mw->phys_addr;
 	mw_size = mw->phys_size;
 
 	tx_size = mw_size / num_qps_mw;
 	qp_offset = tx_size * (qp_num / mw_count);
 
 	qp->tx_mw = mw->vbase + qp_offset;
 	KASSERT(qp->tx_mw != NULL, ("uh oh?"));
 
 	/* XXX Assumes that a vm_paddr_t is equivalent to bus_addr_t */
 	qp->tx_mw_phys = mw_base + qp_offset;
 	KASSERT(qp->tx_mw_phys != 0, ("uh oh?"));
 
 	tx_size -= sizeof(struct ntb_rx_info);
 	qp->rx_info = (void *)(qp->tx_mw + tx_size);
 
 	/* Due to house-keeping, there must be at least 2 buffs */
 	qp->tx_max_frame = qmin(transport_mtu, tx_size / 2);
 	qp->tx_max_entry = tx_size / qp->tx_max_frame;
 
 	callout_init(&qp->link_work, 0);
 	callout_init(&qp->rx_full, 1);
 
 	mtx_init(&qp->ntb_rx_q_lock, "ntb rx q", NULL, MTX_SPIN);
 	mtx_init(&qp->ntb_tx_free_q_lock, "ntb tx free q", NULL, MTX_SPIN);
 	mtx_init(&qp->tx_lock, "ntb transport tx", NULL, MTX_DEF);
 	TASK_INIT(&qp->rxc_db_work, 0, ntb_transport_rxc_db, qp);
 	qp->rxc_tq = taskqueue_create("ntbt_rx", M_WAITOK,
 	    taskqueue_thread_enqueue, &qp->rxc_tq);
 	taskqueue_start_threads(&qp->rxc_tq, 1, PI_NET, "%s rx%d",
 	    device_get_nameunit(nt->dev), qp_num);
 
 	STAILQ_INIT(&qp->rx_post_q);
 	STAILQ_INIT(&qp->rx_pend_q);
 	STAILQ_INIT(&qp->tx_free_q);
 }
 
 void
 ntb_transport_free_queue(struct ntb_transport_qp *qp)
 {
+	struct ntb_transport_ctx *nt = qp->transport;
 	struct ntb_queue_entry *entry;
 
-	if (qp == NULL)
-		return;
-
 	callout_drain(&qp->link_work);
 
 	ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
 	taskqueue_drain_all(qp->rxc_tq);
 	taskqueue_free(qp->rxc_tq);
 
 	qp->cb_data = NULL;
 	qp->rx_handler = NULL;
 	qp->tx_handler = NULL;
 	qp->event_handler = NULL;
 
 	while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_pend_q)))
 		free(entry, M_NTB_T);
 
 	while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_post_q)))
 		free(entry, M_NTB_T);
 
 	while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
 		free(entry, M_NTB_T);
 
-	set_bit(qp->qp_num, &qp->transport->qp_bitmap_free);
+	nt->qp_bitmap &= ~(1ull << qp->qp_num);	/* 64-bit mask, qp_num 0-63 */
 }
 
 /**
  * ntb_transport_create_queue - Create a new NTB transport layer queue
  * @rx_handler: receive callback function
  * @tx_handler: transmit callback function
  * @event_handler: event callback function
  *
  * Create a new NTB transport layer queue and provide the queue with a callback
  * routine for both transmit and receive.  The receive callback routine will be
  * used to pass up data when the transport has received it on the queue.   The
  * transmit callback routine will be called when the transport has completed the
  * transmission of the data on the queue and the data is ready to be freed.
  *
  * RETURNS: pointer to newly created ntb_queue, NULL on error.
  */
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, device_t dev,
-    const struct ntb_queue_handlers *handlers)
+ntb_transport_create_queue(device_t dev, int q,
+    const struct ntb_queue_handlers *handlers, void *data)
 {
-	struct ntb_transport_ctx *nt = device_get_softc(dev);
+	struct ntb_transport_child *nc = device_get_ivars(dev);
+	struct ntb_transport_ctx *nt = device_get_softc(device_get_parent(dev));
 	struct ntb_queue_entry *entry;
 	struct ntb_transport_qp *qp;
-	unsigned int free_queue;
 	int i;
 
-	free_queue = ffs_bit(&nt->qp_bitmap_free);
-	if (free_queue == 0)
+	if (q < 0 || q >= nc->qpcnt)
 		return (NULL);
 
-	/* decrement free_queue to make it zero based */
-	free_queue--;
-
-	qp = &nt->qp_vec[free_queue];
-	clear_bit(qp->qp_num, &nt->qp_bitmap_free);
+	qp = &nt->qp_vec[nc->qpoff + q];
+	nt->qp_bitmap |= (1ull << qp->qp_num);	/* 64-bit mask, qp_num 0-63 */
 	qp->cb_data = data;
 	qp->rx_handler = handlers->rx_handler;
 	qp->tx_handler = handlers->tx_handler;
 	qp->event_handler = handlers->event_handler;
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
 		entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
 		entry->cb_data = data;
 		entry->buf = NULL;
 		entry->len = transport_mtu;
 		entry->qp = qp;
 		ntb_list_add(&qp->ntb_rx_q_lock, entry, &qp->rx_pend_q);
 	}
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
 		entry = malloc(sizeof(*entry), M_NTB_T, M_WAITOK | M_ZERO);
 		entry->qp = qp;
 		ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
 	}
 
 	ntb_db_clear(dev, 1ull << qp->qp_num);
 	return (qp);
 }
 
 /**
  * ntb_transport_link_up - Notify NTB transport of client readiness to use queue
  * @qp: NTB transport layer queue to be enabled
  *
  * Notify NTB transport layer of client readiness to use queue
  */
 void
 ntb_transport_link_up(struct ntb_transport_qp *qp)
 {
 	struct ntb_transport_ctx *nt = qp->transport;
 
 	qp->client_ready = true;
 
 	ntb_printf(2, "qp %d client ready\n", qp->qp_num);
 
 	if (nt->link_is_up)
 		callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
 }
 
 
 
 /* Transport Tx */
 
 /**
  * ntb_transport_tx_enqueue - Enqueue a new NTB queue entry
  * @qp: NTB transport layer queue the entry is to be enqueued on
  * @cb: per buffer pointer for callback function to use
  * @data: pointer to data buffer that will be sent
  * @len: length of the data buffer
  *
  * Enqueue a new transmit buffer onto the transport queue from which a NTB
  * payload will be transmitted.  This assumes that a lock is being held to
  * serialize access to the qp.
  *
  * RETURNS: An appropriate ERRNO error value on error, or zero for success.
  */
 int
 ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
     unsigned int len)
 {
 	struct ntb_queue_entry *entry;
 	int rc;
 
-	if (qp == NULL || !qp->link_is_up || len == 0) {
+	if (!qp->link_is_up || len == 0) {
 		CTR0(KTR_NTB, "TX: link not up");
 		return (EINVAL);
 	}
 
 	entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
 	if (entry == NULL) {
 		CTR0(KTR_NTB, "TX: could not get entry from tx_free_q");
 		qp->tx_err_no_buf++;
 		return (EBUSY);
 	}
 	CTR1(KTR_NTB, "TX: got entry %p from tx_free_q", entry);
 
 	entry->cb_data = cb;
 	entry->buf = data;
 	entry->len = len;
 	entry->flags = 0;
 
 	mtx_lock(&qp->tx_lock);
 	rc = ntb_process_tx(qp, entry);
 	mtx_unlock(&qp->tx_lock);
 	if (rc != 0) {
 		ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
 		CTR1(KTR_NTB,
 		    "TX: process_tx failed. Returning entry %p to tx_free_q",
 		    entry);
 	}
 	return (rc);
 }
 
 static void
 ntb_tx_copy_callback(void *data)
 {
 	struct ntb_queue_entry *entry = data;
 	struct ntb_transport_qp *qp = entry->qp;
 	struct ntb_payload_header *hdr = entry->x_hdr;
 
 	iowrite32(entry->flags | NTBT_DESC_DONE_FLAG, &hdr->flags);
 	CTR1(KTR_NTB, "TX: hdr %p set DESC_DONE", hdr);
 
 	ntb_peer_db_set(qp->dev, 1ull << qp->qp_num);
 
 	/*
 	 * The entry length can only be zero if the packet is intended to be a
 	 * "link down" or similar.  Since no payload is being sent in these
 	 * cases, there is nothing to add to the completion queue.
 	 */
 	if (entry->len > 0) {
 		qp->tx_bytes += entry->len;
 
 		if (qp->tx_handler)
 			qp->tx_handler(qp, qp->cb_data, entry->buf,
 			    entry->len);
 		else
 			m_freem(entry->buf);
 		entry->buf = NULL;
 	}
 
 	CTR3(KTR_NTB,
 	    "TX: entry %p sent. hdr->ver = %u, hdr->flags = 0x%x, Returning "
 	    "to tx_free_q", entry, hdr->ver, hdr->flags);
 	ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
 }
 
 static void
 ntb_memcpy_tx(struct ntb_queue_entry *entry, void *offset)
 {
 
 	CTR2(KTR_NTB, "TX: copying %d bytes to offset %p", entry->len, offset);
 	if (entry->buf != NULL) {
 		m_copydata((struct mbuf *)entry->buf, 0, entry->len, offset);
 
 		/*
 		 * Ensure that the data is fully copied before setting the
 		 * flags
 		 */
 		wmb();
 	}
 
 	ntb_tx_copy_callback(entry);
 }
 
 static void
 ntb_async_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
 {
 	struct ntb_payload_header *hdr;
 	void *offset;
 
 	offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
 	hdr = (struct ntb_payload_header *)((char *)offset + qp->tx_max_frame -
 	    sizeof(struct ntb_payload_header));
 	entry->x_hdr = hdr;
 
 	iowrite32(entry->len, &hdr->len);
 	iowrite32(qp->tx_pkts, &hdr->ver);
 
 	ntb_memcpy_tx(entry, offset);
 }
 
 static int
 ntb_process_tx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry)
 {
 
 	CTR3(KTR_NTB,
 	    "TX: process_tx: tx_pkts=%lu, tx_index=%u, remote entry=%u",
 	    qp->tx_pkts, qp->tx_index, qp->remote_rx_info->entry);
 	if (qp->tx_index == qp->remote_rx_info->entry) {
 		CTR0(KTR_NTB, "TX: ring full");
 		qp->tx_ring_full++;
 		return (EAGAIN);
 	}
 
 	if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
 		if (qp->tx_handler != NULL)
 			qp->tx_handler(qp, qp->cb_data, entry->buf,
 			    EIO);
 		else
 			m_freem(entry->buf);
 
 		entry->buf = NULL;
 		ntb_list_add(&qp->ntb_tx_free_q_lock, entry, &qp->tx_free_q);
 		CTR1(KTR_NTB,
 		    "TX: frame too big. returning entry %p to tx_free_q",
 		    entry);
 		return (0);
 	}
 	CTR2(KTR_NTB, "TX: copying entry %p to index %u", entry, qp->tx_index);
 	ntb_async_tx(qp, entry);
 
 	qp->tx_index++;
 	qp->tx_index %= qp->tx_max_entry;
 
 	qp->tx_pkts++;
 
 	return (0);
 }
 
 /* Transport Rx */
 static void
 ntb_transport_rxc_db(void *arg, int pending __unused)
 {
 	struct ntb_transport_qp *qp = arg;
 	int rc;
 
 	CTR0(KTR_NTB, "RX: transport_rx");
 again:
 	while ((rc = ntb_process_rxc(qp)) == 0)
 		;
 	CTR1(KTR_NTB, "RX: process_rxc returned %d", rc);
 
 	if ((ntb_db_read(qp->dev) & (1ull << qp->qp_num)) != 0) {
 		/* If db is set, clear it and check queue once more. */
 		ntb_db_clear(qp->dev, 1ull << qp->qp_num);
 		goto again;
 	}
 }
 
 static int
 ntb_process_rxc(struct ntb_transport_qp *qp)
 {
 	struct ntb_payload_header *hdr;
 	struct ntb_queue_entry *entry;
 	caddr_t offset;
 
 	offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
 	hdr = (void *)(offset + qp->rx_max_frame -
 	    sizeof(struct ntb_payload_header));
 
 	CTR1(KTR_NTB, "RX: process_rxc rx_index = %u", qp->rx_index);
 	if ((hdr->flags & NTBT_DESC_DONE_FLAG) == 0) {
 		CTR0(KTR_NTB, "RX: hdr not done");
 		qp->rx_ring_empty++;
 		return (EAGAIN);
 	}
 
 	if ((hdr->flags & NTBT_LINK_DOWN_FLAG) != 0) {
 		CTR0(KTR_NTB, "RX: link down");
 		ntb_qp_link_down(qp);
 		hdr->flags = 0;
 		return (EAGAIN);
 	}
 
 	if (hdr->ver != (uint32_t)qp->rx_pkts) {
 		CTR2(KTR_NTB,"RX: ver != rx_pkts (%x != %lx). "
 		    "Returning entry to rx_pend_q", hdr->ver, qp->rx_pkts);
 		qp->rx_err_ver++;
 		return (EIO);
 	}
 
 	entry = ntb_list_mv(&qp->ntb_rx_q_lock, &qp->rx_pend_q, &qp->rx_post_q);
 	if (entry == NULL) {
 		qp->rx_err_no_buf++;
 		CTR0(KTR_NTB, "RX: No entries in rx_pend_q");
 		return (EAGAIN);
 	}
 	callout_stop(&qp->rx_full);
 	CTR1(KTR_NTB, "RX: rx entry %p from rx_pend_q", entry);
 
 	entry->x_hdr = hdr;
 	entry->index = qp->rx_index;
 
 	if (hdr->len > entry->len) {
 		CTR2(KTR_NTB, "RX: len too long. Wanted %ju got %ju",
 		    (uintmax_t)hdr->len, (uintmax_t)entry->len);
 		qp->rx_err_oflow++;
 
 		entry->len = -EIO;
 		entry->flags |= NTBT_DESC_DONE_FLAG;
 
 		ntb_complete_rxc(qp);
 	} else {
 		qp->rx_bytes += hdr->len;
 		qp->rx_pkts++;
 
 		CTR1(KTR_NTB, "RX: received %ld rx_pkts", qp->rx_pkts);
 
 		entry->len = hdr->len;
 
 		ntb_memcpy_rx(qp, entry, offset);
 	}
 
 	qp->rx_index++;
 	qp->rx_index %= qp->rx_max_entry;
 	return (0);
 }
 
 static void
 ntb_memcpy_rx(struct ntb_transport_qp *qp, struct ntb_queue_entry *entry,
     void *offset)
 {
 	struct ifnet *ifp = entry->cb_data;
 	unsigned int len = entry->len;
 
 	CTR2(KTR_NTB, "RX: copying %d bytes from offset %p", len, offset);
 
 	entry->buf = (void *)m_devget(offset, len, 0, ifp, NULL);
 	if (entry->buf == NULL)
 		entry->len = -ENOMEM;
 
 	/* Ensure that the data is globally visible before clearing the flag */
 	wmb();
 
 	CTR2(KTR_NTB, "RX: copied entry %p to mbuf %p.", entry, entry->buf);
 	ntb_rx_copy_callback(qp, entry);
 }
 
 static inline void
 ntb_rx_copy_callback(struct ntb_transport_qp *qp, void *data)
 {
 	struct ntb_queue_entry *entry;
 
 	entry = data;
 	entry->flags |= NTBT_DESC_DONE_FLAG;
 	ntb_complete_rxc(qp);
 }
 
 static void
 ntb_complete_rxc(struct ntb_transport_qp *qp)
 {
 	struct ntb_queue_entry *entry;
 	struct mbuf *m;
 	unsigned len;
 
 	CTR0(KTR_NTB, "RX: rx_completion_task");
 
 	mtx_lock_spin(&qp->ntb_rx_q_lock);
 
 	while (!STAILQ_EMPTY(&qp->rx_post_q)) {
 		entry = STAILQ_FIRST(&qp->rx_post_q);
 		if ((entry->flags & NTBT_DESC_DONE_FLAG) == 0)
 			break;
 
 		entry->x_hdr->flags = 0;
 		iowrite32(entry->index, &qp->rx_info->entry);
 
 		STAILQ_REMOVE_HEAD(&qp->rx_post_q, entry);
 
 		len = entry->len;
 		m = entry->buf;
 
 		/*
 		 * Re-initialize queue_entry for reuse; rx_handler takes
 		 * ownership of the mbuf.
 		 */
 		entry->buf = NULL;
 		entry->len = transport_mtu;
 		entry->cb_data = qp->cb_data;
 
 		STAILQ_INSERT_TAIL(&qp->rx_pend_q, entry, entry);
 
 		mtx_unlock_spin(&qp->ntb_rx_q_lock);
 
 		CTR2(KTR_NTB, "RX: completing entry %p, mbuf %p", entry, m);
 		if (qp->rx_handler != NULL && qp->client_ready)
 			qp->rx_handler(qp, qp->cb_data, m, len);
 		else
 			m_freem(m);
 
 		mtx_lock_spin(&qp->ntb_rx_q_lock);
 	}
 
 	mtx_unlock_spin(&qp->ntb_rx_q_lock);
 }
 
 static void
 ntb_transport_doorbell_callback(void *data, uint32_t vector)
 {
 	struct ntb_transport_ctx *nt = data;
 	struct ntb_transport_qp *qp;
-	struct _qpset db_bits;
 	uint64_t vec_mask;
 	unsigned qp_num;
 
-	BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &db_bits);
-	BIT_NAND(QP_SETSIZE, &db_bits, &nt->qp_bitmap_free);
-
 	vec_mask = ntb_db_vector_mask(nt->dev, vector);
+	vec_mask &= nt->qp_bitmap;
 	if ((vec_mask & (vec_mask - 1)) != 0)
 		vec_mask &= ntb_db_read(nt->dev);
 	while (vec_mask != 0) {
 		qp_num = ffsll(vec_mask) - 1;
 
-		if (test_bit(qp_num, &db_bits)) {
-			qp = &nt->qp_vec[qp_num];
-			if (qp->link_is_up)
-				taskqueue_enqueue(qp->rxc_tq, &qp->rxc_db_work);
-		}
+		qp = &nt->qp_vec[qp_num];
+		if (qp->link_is_up)
+			taskqueue_enqueue(qp->rxc_tq, &qp->rxc_db_work);
 
 		vec_mask &= ~(1ull << qp_num);
 	}
 }
 
 /* Link Event handler */
 static void
 ntb_transport_event_callback(void *data)
 {
 	struct ntb_transport_ctx *nt = data;
 
 	if (ntb_link_is_up(nt->dev, NULL, NULL)) {
 		ntb_printf(1, "HW link up\n");
 		callout_reset(&nt->link_work, 0, ntb_transport_link_work, nt);
 	} else {
 		ntb_printf(1, "HW link down\n");
 		taskqueue_enqueue(taskqueue_swi, &nt->link_cleanup);
 	}
 }
 
 /* Link bring up */
 static void
 ntb_transport_link_work(void *arg)
 {
 	struct ntb_transport_ctx *nt = arg;
 	device_t dev = nt->dev;
 	struct ntb_transport_qp *qp;
 	uint64_t val64, size;
 	uint32_t val;
 	unsigned i;
 	int rc;
 
 	/* send the local info, in the opposite order of the way we read it */
 	for (i = 0; i < nt->mw_count; i++) {
 		size = nt->mw_vec[i].phys_size;
 
 		if (max_mw_size != 0 && size > max_mw_size)
 			size = max_mw_size;
 
 		ntb_peer_spad_write(dev, NTBT_MW0_SZ_HIGH + (i * 2),
 		    size >> 32);
 		ntb_peer_spad_write(dev, NTBT_MW0_SZ_LOW + (i * 2), size);
 	}
-
 	ntb_peer_spad_write(dev, NTBT_NUM_MWS, nt->mw_count);
-
 	ntb_peer_spad_write(dev, NTBT_NUM_QPS, nt->qp_count);
-
+	ntb_peer_spad_write(dev, NTBT_QP_LINKS, 0);
 	ntb_peer_spad_write(dev, NTBT_VERSION, NTB_TRANSPORT_VERSION);
 
 	/* Query the remote side for its info */
 	val = 0;
 	ntb_spad_read(dev, NTBT_VERSION, &val);
 	if (val != NTB_TRANSPORT_VERSION)
 		goto out;
 
 	ntb_spad_read(dev, NTBT_NUM_QPS, &val);
 	if (val != nt->qp_count)
 		goto out;
 
 	ntb_spad_read(dev, NTBT_NUM_MWS, &val);
 	if (val != nt->mw_count)
 		goto out;
 
 	for (i = 0; i < nt->mw_count; i++) {
 		ntb_spad_read(dev, NTBT_MW0_SZ_HIGH + (i * 2), &val);
 		val64 = (uint64_t)val << 32;
 
 		ntb_spad_read(dev, NTBT_MW0_SZ_LOW + (i * 2), &val);
 		val64 |= val;
 
 		rc = ntb_set_mw(nt, i, val64);
 		if (rc != 0)
 			goto free_mws;
 	}
 
 	nt->link_is_up = true;
 	ntb_printf(1, "transport link up\n");
 
 	for (i = 0; i < nt->qp_count; i++) {
 		qp = &nt->qp_vec[i];
 
 		ntb_transport_setup_qp_mw(nt, i);
 
 		if (qp->client_ready)
 			callout_reset(&qp->link_work, 0, ntb_qp_link_work, qp);
 	}
 
 	return;
 
 free_mws:
 	for (i = 0; i < nt->mw_count; i++)
 		ntb_free_mw(nt, i);
 out:
 	if (ntb_link_is_up(dev, NULL, NULL))
 		callout_reset(&nt->link_work,
 		    NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_transport_link_work, nt);
 }
 
 static int
 ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, size_t size)
 {
 	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
 	size_t xlat_size, buff_size;
 	int rc;
 
 	if (size == 0)
 		return (EINVAL);
 
 	xlat_size = roundup(size, mw->xlat_align_size);
 	buff_size = xlat_size;
 
 	/* No need to re-setup */
 	if (mw->xlat_size == xlat_size)
 		return (0);
 
 	if (mw->buff_size != 0)
 		ntb_free_mw(nt, num_mw);
 
 	/* Alloc memory for receiving data.  Must be aligned */
 	mw->xlat_size = xlat_size;
 	mw->buff_size = buff_size;
 
 	mw->virt_addr = contigmalloc(mw->buff_size, M_NTB_T, M_ZERO, 0,
 	    mw->addr_limit, mw->xlat_align, 0);
 	if (mw->virt_addr == NULL) {
 		ntb_printf(0, "Unable to allocate MW buffer of size %zu/%zu\n",
 		    mw->buff_size, mw->xlat_size);
 		mw->xlat_size = 0;
 		mw->buff_size = 0;
 		return (ENOMEM);
 	}
 	/* TODO: replace with bus_space_* functions */
 	mw->dma_addr = vtophys(mw->virt_addr);
 
 	/*
 	 * Ensure that the allocation from contigmalloc is aligned as
 	 * requested.  XXX: This may not be needed -- brought in for parity
 	 * with the Linux driver.
 	 */
 	if (mw->dma_addr % mw->xlat_align != 0) {
 		ntb_printf(0,
 		    "DMA memory 0x%jx not aligned to BAR size 0x%zx\n",
 		    (uintmax_t)mw->dma_addr, size);
 		ntb_free_mw(nt, num_mw);
 		return (ENOMEM);
 	}
 
 	/* Notify HW the memory location of the receive buffer */
 	rc = ntb_mw_set_trans(nt->dev, num_mw, mw->dma_addr, mw->xlat_size);
 	if (rc) {
 		ntb_printf(0, "Unable to set mw%d translation\n", num_mw);
 		ntb_free_mw(nt, num_mw);
 		return (rc);
 	}
 
 	return (0);
 }
 
 static void
 ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
 {
 	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
 
 	if (mw->virt_addr == NULL)
 		return;
 
 	ntb_mw_clear_trans(nt->dev, num_mw);
 	contigfree(mw->virt_addr, mw->xlat_size, M_NTB_T);
 	mw->xlat_size = 0;
 	mw->buff_size = 0;
 	mw->virt_addr = NULL;
 }
 
 static int
 ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, unsigned int qp_num)
 {
 	struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
 	struct ntb_transport_mw *mw;
 	void *offset;
 	ntb_q_idx_t i;
 	size_t rx_size;
 	unsigned num_qps_mw, mw_num, mw_count;
 
 	mw_count = nt->mw_count;
 	mw_num = QP_TO_MW(nt, qp_num);
 	mw = &nt->mw_vec[mw_num];
 
 	if (mw->virt_addr == NULL)
 		return (ENOMEM);
 
 	if (mw_num < nt->qp_count % mw_count)
 		num_qps_mw = nt->qp_count / mw_count + 1;
 	else
 		num_qps_mw = nt->qp_count / mw_count;
 
 	rx_size = mw->xlat_size / num_qps_mw;
 	qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
 	rx_size -= sizeof(struct ntb_rx_info);
 
 	qp->remote_rx_info = (void*)(qp->rx_buff + rx_size);
 
 	/* Due to house-keeping, there must be at least 2 buffs */
 	qp->rx_max_frame = qmin(transport_mtu, rx_size / 2);
 	qp->rx_max_entry = rx_size / qp->rx_max_frame;
 	qp->rx_index = 0;
 
 	qp->remote_rx_info->entry = qp->rx_max_entry - 1;
 
 	/* Set up the hdr offsets with 0s */
 	for (i = 0; i < qp->rx_max_entry; i++) {
 		offset = (void *)(qp->rx_buff + qp->rx_max_frame * (i + 1) -
 		    sizeof(struct ntb_payload_header));
 		memset(offset, 0, sizeof(struct ntb_payload_header));
 	}
 
 	qp->rx_pkts = 0;
 	qp->tx_pkts = 0;
 	qp->tx_index = 0;
 
 	return (0);
 }
 
 static void
 ntb_qp_link_work(void *arg)
 {
 	struct ntb_transport_qp *qp = arg;
 	device_t dev = qp->dev;
 	struct ntb_transport_ctx *nt = qp->transport;
-	uint32_t val, dummy;
+	int i;
+	uint32_t val;
 
-	ntb_spad_read(dev, NTBT_QP_LINKS, &val);
+	/* Report queues that are up on our side */
+	for (i = 0, val = 0; i < nt->qp_count; i++) {
+		if (nt->qp_vec[i].client_ready)
+			val |= (1 << i);
+	}
+	ntb_peer_spad_write(dev, NTBT_QP_LINKS, val);
 
-	ntb_peer_spad_write(dev, NTBT_QP_LINKS, val | (1ull << qp->qp_num));
-
-	/* query remote spad for qp ready bits */
-	ntb_peer_spad_read(dev, NTBT_QP_LINKS, &dummy);
-
 	/* See if the remote side is up */
+	ntb_spad_read(dev, NTBT_QP_LINKS, &val);
 	if ((val & (1ull << qp->qp_num)) != 0) {
 		ntb_printf(2, "qp %d link up\n", qp->qp_num);
 		qp->link_is_up = true;
 
 		if (qp->event_handler != NULL)
 			qp->event_handler(qp->cb_data, NTB_LINK_UP);
 
 		ntb_db_clear_mask(dev, 1ull << qp->qp_num);
 	} else if (nt->link_is_up)
 		callout_reset(&qp->link_work,
 		    NTB_LINK_DOWN_TIMEOUT * hz / 1000, ntb_qp_link_work, qp);
 }
 
 /* Link down event*/
 static void
 ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
 {
 	struct ntb_transport_qp *qp;
-	struct _qpset qp_bitmap_alloc;
-	unsigned i;
+	int i;
 
-	BIT_COPY(QP_SETSIZE, &nt->qp_bitmap, &qp_bitmap_alloc);
-	BIT_NAND(QP_SETSIZE, &qp_bitmap_alloc, &nt->qp_bitmap_free);
-
 	/* Pass along the info to any clients */
-	for (i = 0; i < nt->qp_count; i++)
-		if (test_bit(i, &qp_bitmap_alloc)) {
+	for (i = 0; i < nt->qp_count; i++) {
+		if ((nt->qp_bitmap & (1 << i)) != 0) {
 			qp = &nt->qp_vec[i];
 			ntb_qp_link_cleanup(qp);
 			callout_drain(&qp->link_work);
 		}
+	}
 
 	if (!nt->link_is_up)
 		callout_drain(&nt->link_work);
 
 	/*
 	 * The scratchpad registers keep the values if the remote side
 	 * goes down, blast them now to give them a sane value the next
 	 * time they are accessed
 	 */
-	for (i = 0; i < NTBT_MAX_SPAD; i++)
-		ntb_spad_write(nt->dev, i, 0);
+	ntb_spad_clear(nt->dev);
 }
 
 static void
 ntb_transport_link_cleanup_work(void *arg, int pending __unused)
 {
 
 	ntb_transport_link_cleanup(arg);
 }
 
 static void
 ntb_qp_link_down(struct ntb_transport_qp *qp)
 {
 
 	ntb_qp_link_cleanup(qp);
 }
 
 static void
 ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
 {
 
 	qp->link_is_up = false;
 	ntb_db_set_mask(qp->dev, 1ull << qp->qp_num);
 
 	qp->tx_index = qp->rx_index = 0;
 	qp->tx_bytes = qp->rx_bytes = 0;
 	qp->tx_pkts = qp->rx_pkts = 0;
 
 	qp->rx_ring_empty = 0;
 	qp->tx_ring_full = 0;
 
 	qp->rx_err_no_buf = qp->tx_err_no_buf = 0;
 	qp->rx_err_oflow = qp->rx_err_ver = 0;
 }
 
 static void
 ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
 {
 
 	callout_drain(&qp->link_work);
 	ntb_qp_link_down_reset(qp);
 
 	if (qp->event_handler != NULL)
 		qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
 }
 
 /* Link commanded down */
 /**
  * ntb_transport_link_down - Notify NTB transport to no longer enqueue data
  * @qp: NTB transport layer queue to be disabled
  *
  * Notify NTB transport layer of client's desire to no longer receive data on
  * transport queue specified.  It is the client's responsibility to ensure all
  * entries on queue are purged or otherwise handled appropriately.
  */
 void
 ntb_transport_link_down(struct ntb_transport_qp *qp)
 {
+	struct ntb_transport_ctx *nt = qp->transport;
+	int i;
 	uint32_t val;
 
-	if (qp == NULL)
-		return;
-
 	qp->client_ready = false;
+	for (i = 0, val = 0; i < nt->qp_count; i++) {
+		if (nt->qp_vec[i].client_ready)
+			val |= (1 << i);
+	}
+	ntb_peer_spad_write(qp->dev, NTBT_QP_LINKS, val);
 
-	ntb_spad_read(qp->dev, NTBT_QP_LINKS, &val);
-
-	ntb_peer_spad_write(qp->dev, NTBT_QP_LINKS,
-	   val & ~(1 << qp->qp_num));
-
 	if (qp->link_is_up)
 		ntb_send_link_down(qp);
 	else
 		callout_drain(&qp->link_work);
 }
 
 /**
  * ntb_transport_link_query - Query transport link state
  * @qp: NTB transport layer queue to be queried
  *
  * Query connectivity to the remote system of the NTB transport queue
  *
  * RETURNS: true for link up or false for link down
  */
 bool
 ntb_transport_link_query(struct ntb_transport_qp *qp)
 {
-	if (qp == NULL)
-		return (false);
 
 	return (qp->link_is_up);
 }
 
 static void
 ntb_send_link_down(struct ntb_transport_qp *qp)
 {
 	struct ntb_queue_entry *entry;
 	int i, rc;
 
 	if (!qp->link_is_up)
 		return;
 
 	for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
 		entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
 		if (entry != NULL)
 			break;
 		pause("NTB Wait for link down", hz / 10);
 	}
 
 	if (entry == NULL)
 		return;
 
 	entry->cb_data = NULL;
 	entry->buf = NULL;
 	entry->len = 0;
 	entry->flags = NTBT_LINK_DOWN_FLAG;
 
 	mtx_lock(&qp->tx_lock);
 	rc = ntb_process_tx(qp, entry);
 	mtx_unlock(&qp->tx_lock);
 	if (rc != 0)
 		printf("ntb: Failed to send link down\n");
 
 	ntb_qp_link_down_reset(qp);
 }
 
 
 /* List Management */
 
 static void
 ntb_list_add(struct mtx *lock, struct ntb_queue_entry *entry,
     struct ntb_queue_list *list)
 {
 
 	mtx_lock_spin(lock);
 	STAILQ_INSERT_TAIL(list, entry, entry);
 	mtx_unlock_spin(lock);
 }
 
 static struct ntb_queue_entry *
 ntb_list_rm(struct mtx *lock, struct ntb_queue_list *list)
 {
 	struct ntb_queue_entry *entry;
 
 	mtx_lock_spin(lock);
 	if (STAILQ_EMPTY(list)) {
 		entry = NULL;
 		goto out;
 	}
 	entry = STAILQ_FIRST(list);
 	STAILQ_REMOVE_HEAD(list, entry);
 out:
 	mtx_unlock_spin(lock);
 
 	return (entry);
 }
 
 static struct ntb_queue_entry *
 ntb_list_mv(struct mtx *lock, struct ntb_queue_list *from,
     struct ntb_queue_list *to)
 {
 	struct ntb_queue_entry *entry;
 
 	mtx_lock_spin(lock);
 	if (STAILQ_EMPTY(from)) {
 		entry = NULL;
 		goto out;
 	}
 	entry = STAILQ_FIRST(from);
 	STAILQ_REMOVE_HEAD(from, entry);
 	STAILQ_INSERT_TAIL(to, entry, entry);
 
 out:
 	mtx_unlock_spin(lock);
 	return (entry);
 }
 
 /**
  * ntb_transport_qp_num - Query the qp number
  * @qp: NTB transport layer queue to be queried
  *
  * Query qp number of the NTB transport queue
  *
  * RETURNS: a zero based number specifying the qp number
  */
 unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp)
 {
-	if (qp == NULL)
-		return 0;
 
 	return (qp->qp_num);
 }
 
 /**
  * ntb_transport_max_size - Query the max payload size of a qp
  * @qp: NTB transport layer queue to be queried
  *
  * Query the maximum payload size permissible on the given qp
  *
  * RETURNS: the max payload size of a qp
  */
 unsigned int
 ntb_transport_max_size(struct ntb_transport_qp *qp)
 {
-
-	if (qp == NULL)
-		return (0);
 
 	return (qp->tx_max_frame - sizeof(struct ntb_payload_header));
 }
 
 unsigned int
 ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
 {
 	unsigned int head = qp->tx_index;
 	unsigned int tail = qp->remote_rx_info->entry;
 
 	return (tail >= head ? tail - head : qp->tx_max_entry + tail - head);
 }
 
 static device_method_t ntb_transport_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,     ntb_transport_probe),
 	DEVMETHOD(device_attach,    ntb_transport_attach),
 	DEVMETHOD(device_detach,    ntb_transport_detach),
 	DEVMETHOD_END
 };
 
 devclass_t ntb_transport_devclass;
 static DEFINE_CLASS_0(ntb_transport, ntb_transport_driver,
     ntb_transport_methods, sizeof(struct ntb_transport_ctx));
 DRIVER_MODULE(ntb_transport, ntb_hw, ntb_transport_driver,
     ntb_transport_devclass, NULL, NULL);
 MODULE_DEPEND(ntb_transport, ntb, 1, 1, 1);
 MODULE_VERSION(ntb_transport, 1);
Index: user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.h	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/dev/ntb/ntb_transport.h	(revision 303517)
@@ -1,60 +1,61 @@
 /*-
  * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 struct ntb_transport_qp;
 
 extern devclass_t ntb_transport_devclass;
 
 enum ntb_link_event {
 	NTB_LINK_DOWN = 0,
 	NTB_LINK_UP,
 };
 
 struct ntb_queue_handlers {
 	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
 	    void *data, int len);
 	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
 	    void *data, int len);
 	void (*event_handler)(void *data, enum ntb_link_event status);
 };
 
-unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
-unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+int ntb_transport_queue_count(device_t dev);
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, device_t dev,
-			   const struct ntb_queue_handlers *handlers);
+ntb_transport_create_queue(device_t dev, int q,
+    const struct ntb_queue_handlers *handlers, void *data);
 void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
 int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
 			     unsigned int len);
 int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
 			     unsigned int len);
 void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
 void ntb_transport_link_up(struct ntb_transport_qp *qp);
 void ntb_transport_link_down(struct ntb_transport_qp *qp);
 bool ntb_transport_link_query(struct ntb_transport_qp *qp);
 unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
Index: user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c	(revision 303517)
@@ -1,2799 +1,2807 @@
 /*-
  * Copyright (c) 1994,1995 Stefan Esser, Wolfgang StanglMeier
  * Copyright (c) 2000 Michael Smith <msmith@freebsd.org>
  * Copyright (c) 2000 BSDi
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * PCI:PCI bridge support.
  */
 
 #include "opt_pci.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pci_private.h>
 #include <dev/pci/pcib_private.h>
 
 #include "pcib_if.h"
 
 static int		pcib_probe(device_t dev);
 static int		pcib_suspend(device_t dev);
 static int		pcib_resume(device_t dev);
 static int		pcib_power_for_sleep(device_t pcib, device_t dev,
 			    int *pstate);
 static int		pcib_ari_get_id(device_t pcib, device_t dev,
     enum pci_id_type type, uintptr_t *id);
 static uint32_t		pcib_read_config(device_t dev, u_int b, u_int s,
     u_int f, u_int reg, int width);
 static void		pcib_write_config(device_t dev, u_int b, u_int s,
     u_int f, u_int reg, uint32_t val, int width);
 static int		pcib_ari_maxslots(device_t dev);
 static int		pcib_ari_maxfuncs(device_t dev);
 static int		pcib_try_enable_ari(device_t pcib, device_t dev);
 static int		pcib_ari_enabled(device_t pcib);
 static void		pcib_ari_decode_rid(device_t pcib, uint16_t rid,
 			    int *bus, int *slot, int *func);
 #ifdef PCI_HP
 static void		pcib_pcie_ab_timeout(void *arg);
 static void		pcib_pcie_cc_timeout(void *arg);
 static void		pcib_pcie_dll_timeout(void *arg);
 #endif
 
 static device_method_t pcib_methods[] = {
     /* Device interface */
     DEVMETHOD(device_probe,		pcib_probe),
     DEVMETHOD(device_attach,		pcib_attach),
     DEVMETHOD(device_detach,		pcib_detach),
     DEVMETHOD(device_shutdown,		bus_generic_shutdown),
     DEVMETHOD(device_suspend,		pcib_suspend),
     DEVMETHOD(device_resume,		pcib_resume),
 
     /* Bus interface */
     DEVMETHOD(bus_child_present,	pcib_child_present),
     DEVMETHOD(bus_read_ivar,		pcib_read_ivar),
     DEVMETHOD(bus_write_ivar,		pcib_write_ivar),
     DEVMETHOD(bus_alloc_resource,	pcib_alloc_resource),
 #ifdef NEW_PCIB
     DEVMETHOD(bus_adjust_resource,	pcib_adjust_resource),
     DEVMETHOD(bus_release_resource,	pcib_release_resource),
 #else
     DEVMETHOD(bus_adjust_resource,	bus_generic_adjust_resource),
     DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
 #endif
     DEVMETHOD(bus_activate_resource,	bus_generic_activate_resource),
     DEVMETHOD(bus_deactivate_resource,	bus_generic_deactivate_resource),
     DEVMETHOD(bus_setup_intr,		bus_generic_setup_intr),
     DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),
 
     /* pcib interface */
     DEVMETHOD(pcib_maxslots,		pcib_ari_maxslots),
     DEVMETHOD(pcib_maxfuncs,		pcib_ari_maxfuncs),
     DEVMETHOD(pcib_read_config,		pcib_read_config),
     DEVMETHOD(pcib_write_config,	pcib_write_config),
     DEVMETHOD(pcib_route_interrupt,	pcib_route_interrupt),
     DEVMETHOD(pcib_alloc_msi,		pcib_alloc_msi),
     DEVMETHOD(pcib_release_msi,		pcib_release_msi),
     DEVMETHOD(pcib_alloc_msix,		pcib_alloc_msix),
     DEVMETHOD(pcib_release_msix,	pcib_release_msix),
     DEVMETHOD(pcib_map_msi,		pcib_map_msi),
     DEVMETHOD(pcib_power_for_sleep,	pcib_power_for_sleep),
     DEVMETHOD(pcib_get_id,		pcib_ari_get_id),
     DEVMETHOD(pcib_try_enable_ari,	pcib_try_enable_ari),
     DEVMETHOD(pcib_ari_enabled,		pcib_ari_enabled),
     DEVMETHOD(pcib_decode_rid,		pcib_ari_decode_rid),
 
     DEVMETHOD_END
 };
 
 static devclass_t pcib_devclass;
 
 DEFINE_CLASS_0(pcib, pcib_driver, pcib_methods, sizeof(struct pcib_softc));
 DRIVER_MODULE(pcib, pci, pcib_driver, pcib_devclass, NULL, NULL);
 
 #ifdef NEW_PCIB
 SYSCTL_DECL(_hw_pci);
 
 static int pci_clear_pcib;
 SYSCTL_INT(_hw_pci, OID_AUTO, clear_pcib, CTLFLAG_RDTUN, &pci_clear_pcib, 0,
     "Clear firmware-assigned resources for PCI-PCI bridge I/O windows.");
 
 /*
  * Is a resource from a child device sub-allocated from one of our
  * resource managers?
  */
 static int
 pcib_is_resource_managed(struct pcib_softc *sc, int type, struct resource *r)
 {
 
 	switch (type) {
 #ifdef PCI_RES_BUS
 	case PCI_RES_BUS:
 		return (rman_is_region_manager(r, &sc->bus.rman));
 #endif
 	case SYS_RES_IOPORT:
 		return (rman_is_region_manager(r, &sc->io.rman));
 	case SYS_RES_MEMORY:
 		/* Prefetchable resources may live in either memory rman. */
 		if (rman_get_flags(r) & RF_PREFETCHABLE &&
 		    rman_is_region_manager(r, &sc->pmem.rman))
 			return (1);
 		return (rman_is_region_manager(r, &sc->mem.rman));
 	}
 	return (0);
 }
 
 static int
 pcib_is_window_open(struct pcib_window *pw)
 {
 
 	return (pw->valid && pw->base < pw->limit);
 }
 
 /*
  * XXX: If RF_ACTIVE did not also imply allocating a bus space tag and
  * handle for the resource, we could pass RF_ACTIVE up to the PCI bus
  * when allocating the resource windows and rely on the PCI bus driver
  * to do this for us.
  */
 static void
 pcib_activate_window(struct pcib_softc *sc, int type)
 {
 
 	PCI_ENABLE_IO(device_get_parent(sc->dev), sc->dev, type);
 }
 
 static void
 pcib_write_windows(struct pcib_softc *sc, int mask)
 {
 	device_t dev;
 	uint32_t val;
 
 	dev = sc->dev;
 	if (sc->io.valid && mask & WIN_IO) {
 		val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
 		if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
 			pci_write_config(dev, PCIR_IOBASEH_1,
 			    sc->io.base >> 16, 2);
 			pci_write_config(dev, PCIR_IOLIMITH_1,
 			    sc->io.limit >> 16, 2);
 		}
 		pci_write_config(dev, PCIR_IOBASEL_1, sc->io.base >> 8, 1);
 		pci_write_config(dev, PCIR_IOLIMITL_1, sc->io.limit >> 8, 1);
 	}
 
 	if (mask & WIN_MEM) {
 		pci_write_config(dev, PCIR_MEMBASE_1, sc->mem.base >> 16, 2);
 		pci_write_config(dev, PCIR_MEMLIMIT_1, sc->mem.limit >> 16, 2);
 	}
 
 	if (sc->pmem.valid && mask & WIN_PMEM) {
 		val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
 		if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
 			pci_write_config(dev, PCIR_PMBASEH_1,
 			    sc->pmem.base >> 32, 4);
 			pci_write_config(dev, PCIR_PMLIMITH_1,
 			    sc->pmem.limit >> 32, 4);
 		}
 		pci_write_config(dev, PCIR_PMBASEL_1, sc->pmem.base >> 16, 2);
 		pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmem.limit >> 16, 2);
 	}
 }
 
 /*
  * This is used to reject I/O port allocations that conflict with an
  * ISA alias range.
  */
 static int
 pcib_is_isa_range(struct pcib_softc *sc, rman_res_t start, rman_res_t end,
     rman_res_t count)
 {
 	rman_res_t next_alias;
 
 	if (!(sc->bridgectl & PCIB_BCR_ISA_ENABLE))
 		return (0);
 
 	/* Only check fixed ranges for overlap. */
 	if (start + count - 1 != end)
 		return (0);
 
 	/* ISA aliases are only in the lower 64KB of I/O space. */
 	if (start >= 65536)
 		return (0);
 
 	/* Check for overlap with 0x000 - 0x0ff as a special case. */
 	if (start < 0x100)
 		goto alias;
 
 	/*
 	 * If the start address is an alias, the range is an alias.
 	 * Otherwise, compute the start of the next alias range and
 	 * check if it is before the end of the candidate range.
 	 */
 	if ((start & 0x300) != 0)
 		goto alias;
 	next_alias = (start & ~0x3fful) | 0x100;
 	if (next_alias <= end)
 		goto alias;
 	return (0);
 
 alias:
 	if (bootverbose)
 		device_printf(sc->dev,
 		    "I/O range %#jx-%#jx overlaps with an ISA alias\n", start,
 		    end);
 	return (1);
 }
 
 static void
 pcib_add_window_resources(struct pcib_window *w, struct resource **res,
     int count)
 {
 	struct resource **newarray;
 	int error, i;
 
 	newarray = malloc(sizeof(struct resource *) * (w->count + count),
 	    M_DEVBUF, M_WAITOK);
 	if (w->res != NULL)
 		bcopy(w->res, newarray, sizeof(struct resource *) * w->count);
 	bcopy(res, newarray + w->count, sizeof(struct resource *) * count);
 	free(w->res, M_DEVBUF);
 	w->res = newarray;
 	w->count += count;
 
 	for (i = 0; i < count; i++) {
 		error = rman_manage_region(&w->rman, rman_get_start(res[i]),
 		    rman_get_end(res[i]));
 		if (error)
 			panic("Failed to add resource to rman");
 	}
 }
 
 typedef void (nonisa_callback)(rman_res_t start, rman_res_t end, void *arg);
 
 static void
 pcib_walk_nonisa_ranges(rman_res_t start, rman_res_t end, nonisa_callback *cb,
     void *arg)
 {
 	rman_res_t next_end;
 
 	/*
 	 * If start is within an ISA alias range, move up to the start
 	 * of the next non-alias range.  As a special case, addresses
 	 * in the range 0x000 - 0x0ff should also be skipped since
 	 * those are used for various system I/O devices in ISA
 	 * systems.
 	 */
 	if (start <= 65535) {
 		if (start < 0x100 || (start & 0x300) != 0) {
 			start &= ~0x3ff;
 			start += 0x400;
 		}
 	}
 
 	/* ISA aliases are only in the lower 64KB of I/O space. */
 	while (start <= MIN(end, 65535)) {
 		next_end = MIN(start | 0xff, end);
 		cb(start, next_end, arg);
 		start += 0x400;
 	}
 
 	if (start <= end)
 		cb(start, end, arg);
 }
 
 static void
 count_ranges(rman_res_t start, rman_res_t end, void *arg)
 {
 	int *countp;
 
 	countp = arg;
 	(*countp)++;
 }
 
 struct alloc_state {
 	struct resource **res;
 	struct pcib_softc *sc;
 	int count, error;
 };
 
 static void
 alloc_ranges(rman_res_t start, rman_res_t end, void *arg)
 {
 	struct alloc_state *as;
 	struct pcib_window *w;
 	int rid;
 
 	as = arg;
 	if (as->error != 0)
 		return;
 
 	w = &as->sc->io;
 	rid = w->reg;
 	if (bootverbose)
 		device_printf(as->sc->dev,
 		    "allocating non-ISA range %#jx-%#jx\n", start, end);
 	as->res[as->count] = bus_alloc_resource(as->sc->dev, SYS_RES_IOPORT,
 	    &rid, start, end, end - start + 1, 0);
 	if (as->res[as->count] == NULL)
 		as->error = ENXIO;
 	else
 		as->count++;
 }
 
 static int
 pcib_alloc_nonisa_ranges(struct pcib_softc *sc, rman_res_t start, rman_res_t end)
 {
 	struct alloc_state as;
 	int i, new_count;
 
 	/* First, see how many ranges we need. */
 	new_count = 0;
 	pcib_walk_nonisa_ranges(start, end, count_ranges, &new_count);
 
 	/* Second, allocate the ranges. */
 	as.res = malloc(sizeof(struct resource *) * new_count, M_DEVBUF,
 	    M_WAITOK);
 	as.sc = sc;
 	as.count = 0;
 	as.error = 0;
 	pcib_walk_nonisa_ranges(start, end, alloc_ranges, &as);
 	if (as.error != 0) {
 		for (i = 0; i < as.count; i++)
 			bus_release_resource(sc->dev, SYS_RES_IOPORT,
 			    sc->io.reg, as.res[i]);
 		free(as.res, M_DEVBUF);
 		return (as.error);
 	}
 	KASSERT(as.count == new_count, ("%s: count mismatch", __func__));
 
 	/* Third, add the ranges to the window. */
 	pcib_add_window_resources(&sc->io, as.res, as.count);
 	free(as.res, M_DEVBUF);
 	return (0);
 }
 
 static void
 pcib_alloc_window(struct pcib_softc *sc, struct pcib_window *w, int type,
     int flags, pci_addr_t max_address)
 {
 	struct resource *res;
 	char buf[64];
 	int error, rid;
 
 	if (max_address != (rman_res_t)max_address)
 		max_address = ~0;
 	w->rman.rm_start = 0;
 	w->rman.rm_end = max_address;
 	w->rman.rm_type = RMAN_ARRAY;
 	snprintf(buf, sizeof(buf), "%s %s window",
 	    device_get_nameunit(sc->dev), w->name);
 	w->rman.rm_descr = strdup(buf, M_DEVBUF);
 	error = rman_init(&w->rman);
 	if (error)
 		panic("Failed to initialize %s %s rman",
 		    device_get_nameunit(sc->dev), w->name);
 
 	if (!pcib_is_window_open(w))
 		return;
 
 	if (w->base > max_address || w->limit > max_address) {
 		device_printf(sc->dev,
 		    "initial %s window has too many bits, ignoring\n", w->name);
 		return;
 	}
 	if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE)
 		(void)pcib_alloc_nonisa_ranges(sc, w->base, w->limit);
 	else {
 		rid = w->reg;
 		res = bus_alloc_resource(sc->dev, type, &rid, w->base, w->limit,
 		    w->limit - w->base + 1, flags);
 		if (res != NULL)
 			pcib_add_window_resources(w, &res, 1);
 	}
 	if (w->res == NULL) {
 		device_printf(sc->dev,
 		    "failed to allocate initial %s window: %#jx-%#jx\n",
 		    w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
 		w->base = max_address;
 		w->limit = 0;
 		pcib_write_windows(sc, w->mask);
 		return;
 	}
 	pcib_activate_window(sc, type);
 }
 
 /*
  * Initialize I/O windows.
  */
 static void
 pcib_probe_windows(struct pcib_softc *sc)
 {
 	pci_addr_t max;
 	device_t dev;
 	uint32_t val;
 
 	dev = sc->dev;
 
 	if (pci_clear_pcib) {
 		pcib_bridge_init(dev);
 	}
 
 	/* Determine if the I/O port window is implemented. */
 	val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
 	if (val == 0) {
 		/*
 		 * If 'val' is zero, then only 16-bits of I/O space
 		 * are supported.
 		 */
 		pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
 		if (pci_read_config(dev, PCIR_IOBASEL_1, 1) != 0) {
 			sc->io.valid = 1;
 			pci_write_config(dev, PCIR_IOBASEL_1, 0, 1);
 		}
 	} else
 		sc->io.valid = 1;
 
 	/* Read the existing I/O port window. */
 	if (sc->io.valid) {
 		sc->io.reg = PCIR_IOBASEL_1;
 		sc->io.step = 12;
 		sc->io.mask = WIN_IO;
 		sc->io.name = "I/O port";
 		if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
 			sc->io.base = PCI_PPBIOBASE(
 			    pci_read_config(dev, PCIR_IOBASEH_1, 2), val);
 			sc->io.limit = PCI_PPBIOLIMIT(
 			    pci_read_config(dev, PCIR_IOLIMITH_1, 2),
 			    pci_read_config(dev, PCIR_IOLIMITL_1, 1));
 			max = 0xffffffff;
 		} else {
 			sc->io.base = PCI_PPBIOBASE(0, val);
 			sc->io.limit = PCI_PPBIOLIMIT(0,
 			    pci_read_config(dev, PCIR_IOLIMITL_1, 1));
 			max = 0xffff;
 		}
 		pcib_alloc_window(sc, &sc->io, SYS_RES_IOPORT, 0, max);
 	}
 
 	/* Read the existing memory window. */
 	sc->mem.valid = 1;
 	sc->mem.reg = PCIR_MEMBASE_1;
 	sc->mem.step = 20;
 	sc->mem.mask = WIN_MEM;
 	sc->mem.name = "memory";
 	sc->mem.base = PCI_PPBMEMBASE(0,
 	    pci_read_config(dev, PCIR_MEMBASE_1, 2));
 	sc->mem.limit = PCI_PPBMEMLIMIT(0,
 	    pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
 	pcib_alloc_window(sc, &sc->mem, SYS_RES_MEMORY, 0, 0xffffffff);
 
 	/* Determine if the prefetchable memory window is implemented. */
 	val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
 	if (val == 0) {
 		/*
 		 * If 'val' is zero, then only 32-bits of memory space
 		 * are supported.
 		 */
 		pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
 		if (pci_read_config(dev, PCIR_PMBASEL_1, 2) != 0) {
 			sc->pmem.valid = 1;
 			pci_write_config(dev, PCIR_PMBASEL_1, 0, 2);
 		}
 	} else
 		sc->pmem.valid = 1;
 
 	/* Read the existing prefetchable memory window. */
 	if (sc->pmem.valid) {
 		sc->pmem.reg = PCIR_PMBASEL_1;
 		sc->pmem.step = 20;
 		sc->pmem.mask = WIN_PMEM;
 		sc->pmem.name = "prefetch";
 		if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
 			sc->pmem.base = PCI_PPBMEMBASE(
 			    pci_read_config(dev, PCIR_PMBASEH_1, 4), val);
 			sc->pmem.limit = PCI_PPBMEMLIMIT(
 			    pci_read_config(dev, PCIR_PMLIMITH_1, 4),
 			    pci_read_config(dev, PCIR_PMLIMITL_1, 2));
 			max = 0xffffffffffffffff;
 		} else {
 			sc->pmem.base = PCI_PPBMEMBASE(0, val);
 			sc->pmem.limit = PCI_PPBMEMLIMIT(0,
 			    pci_read_config(dev, PCIR_PMLIMITL_1, 2));
 			max = 0xffffffff;
 		}
 		pcib_alloc_window(sc, &sc->pmem, SYS_RES_MEMORY,
 		    RF_PREFETCHABLE, max);
 	}
 }
 
 /*
  * Tear down one bridge window: destroy its resource manager, free the
  * rman description string, release every backing resource, and free
  * the resource array.  Windows not marked valid are left untouched.
  * If the rman cannot be destroyed, nothing else is released.
  */
 static void
 pcib_release_window(struct pcib_softc *sc, struct pcib_window *w, int type)
 {
 	device_t bridge;
 	int idx, rc;
 
 	if (!w->valid)
 		return;
 
 	bridge = sc->dev;
 
 	/* The rman must go first; bail out if it is still in use. */
 	rc = rman_fini(&w->rman);
 	if (rc != 0) {
 		device_printf(bridge, "failed to release %s rman\n", w->name);
 		return;
 	}
 	free(__DECONST(char *, w->rman.rm_descr), M_DEVBUF);
 
 	/* Hand back each resource that backed the window. */
 	idx = 0;
 	while (idx < w->count) {
 		rc = bus_free_resource(bridge, type, w->res[idx]);
 		if (rc != 0)
 			device_printf(bridge,
 			    "failed to release %s resource: %d\n", w->name,
 			    rc);
 		idx++;
 	}
 	free(w->res, M_DEVBUF);
 }
 
 /*
  * Release all three decode windows of a bridge by calling
  * pcib_release_window() on the prefetchable memory window, the memory
  * window, and the I/O port window, in that order.
  */
 static void
 pcib_free_windows(struct pcib_softc *sc)
 {
 
 	pcib_release_window(sc, &sc->pmem, SYS_RES_MEMORY);
 	pcib_release_window(sc, &sc->mem, SYS_RES_MEMORY);
 	pcib_release_window(sc, &sc->io, SYS_RES_IOPORT);
 }
 
 #ifdef PCI_RES_BUS
 /*
  * Allocate a suitable secondary bus for this bridge if needed and
  * initialize the resource manager for the secondary bus range.  Note
  * that the minimum count is a desired value and this may allocate a
  * smaller range.
  *
  * 'dev' is the bridge device, 'bus' is the secondary bus state to
  * fill in, and 'min_count' is the desired number of bus numbers.
  * Panics if 'dev' has neither a bridge nor a CardBus header type, if
  * the rman cannot be initialized, or if the allocated range cannot be
  * added to the rman.
  */
 void
 pcib_setup_secbus(device_t dev, struct pcib_secbus *bus, int min_count)
 {
 	char buf[64];
 	int error, rid, sec_reg;
 
 	/* The bus number registers live at different offsets per header. */
 	switch (pci_read_config(dev, PCIR_HDRTYPE, 1) & PCIM_HDRTYPE) {
 	case PCIM_HDRTYPE_BRIDGE:
 		sec_reg = PCIR_SECBUS_1;
 		bus->sub_reg = PCIR_SUBBUS_1;
 		break;
 	case PCIM_HDRTYPE_CARDBUS:
 		sec_reg = PCIR_SECBUS_2;
 		bus->sub_reg = PCIR_SUBBUS_2;
 		break;
 	default:
 		panic("not a PCI bridge");
 	}
 	/* Start from the values currently programmed in the bridge. */
 	bus->sec = pci_read_config(dev, sec_reg, 1);
 	bus->sub = pci_read_config(dev, bus->sub_reg, 1);
 	bus->dev = dev;
 	bus->rman.rm_start = 0;
 	bus->rman.rm_end = PCI_BUSMAX;
 	bus->rman.rm_type = RMAN_ARRAY;
 	snprintf(buf, sizeof(buf), "%s bus numbers", device_get_nameunit(dev));
 	/* The copy is freed again in pcib_free_secbus(). */
 	bus->rman.rm_descr = strdup(buf, M_DEVBUF);
 	error = rman_init(&bus->rman);
 	if (error)
 		panic("Failed to initialize %s bus number rman",
 		    device_get_nameunit(dev));
 
 	/*
 	 * Allocate a bus range.  This will return an existing bus range
 	 * if one exists, or a new bus range if one does not.
 	 */
 	rid = 0;
 	bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
 	    min_count, 0);
 	if (bus->res == NULL) {
 		/*
 		 * Fall back to just allocating a range of a single bus
 		 * number.
 		 */
 		bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
 		    1, 0);
 	} else if (rman_get_size(bus->res) < min_count)
 		/*
 		 * Attempt to grow the existing range to satisfy the
 		 * minimum desired count.
 		 */
 		(void)bus_adjust_resource(dev, PCI_RES_BUS, bus->res,
 		    rman_get_start(bus->res), rman_get_start(bus->res) +
 		    min_count - 1);
 
 	/*
 	 * Add the initial resource to the rman.
 	 */
 	if (bus->res != NULL) {
 		error = rman_manage_region(&bus->rman, rman_get_start(bus->res),
 		    rman_get_end(bus->res));
 		if (error)
 			panic("Failed to add resource to rman");
 		/* The allocated range supersedes the values read above. */
 		bus->sec = rman_get_start(bus->res);
 		bus->sub = rman_get_end(bus->res);
 	}
 }
 
 /*
  * Undo pcib_setup_secbus(): destroy the bus number rman, free its
  * description string, and release the bus number resource.  If the
  * rman cannot be destroyed, nothing else is released.
  */
 void
 pcib_free_secbus(device_t dev, struct pcib_secbus *bus)
 {
 	int rc;
 
 	rc = rman_fini(&bus->rman);
 	if (rc != 0) {
 		device_printf(dev, "failed to release bus number rman\n");
 		return;
 	}
 	free(__DECONST(char *, bus->rman.rm_descr), M_DEVBUF);
 
 	rc = bus_free_resource(dev, PCI_RES_BUS, bus->res);
 	if (rc == 0)
 		return;
 	device_printf(dev, "failed to release bus numbers resource: %d\n",
 	    rc);
 }
 
 /*
  * Try to reserve a bus number range for 'child' out of the bridge's
  * bus number rman.  On success the resource's rid is set to '*rid'
  * and the resource is returned; otherwise NULL is returned.
  */
 static struct resource *
 pcib_suballoc_bus(struct pcib_secbus *bus, device_t child, int *rid,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct resource *r;
 
 	r = rman_reserve_resource(&bus->rman, start, end, count, flags,
 	    child);
 	if (r != NULL) {
 		if (bootverbose)
 			device_printf(bus->dev,
 			    "allocated bus range (%ju-%ju) for rid %d of %s\n",
 			    rman_get_start(r), rman_get_end(r), *rid,
 			    pcib_child_name(child));
 		rman_set_rid(r, *rid);
 	}
 	return (r);
 }
 
 /*
  * Attempt to grow the secondary bus range.  This is much simpler than
  * for I/O windows as the range can only be grown by increasing
  * subbus.
  *
  * 'new_end' is the requested last bus number and must be larger than
  * the current end of the range.  Returns 0 on success or the error
  * from bus_adjust_resource().  Panics if the newly gained bus numbers
  * cannot be added to the rman.
  */
 static int
 pcib_grow_subbus(struct pcib_secbus *bus, rman_res_t new_end)
 {
 	rman_res_t old_end;
 	int error;
 
 	old_end = rman_get_end(bus->res);
 	KASSERT(new_end > old_end, ("attempt to shrink subbus"));
 	error = bus_adjust_resource(bus->dev, PCI_RES_BUS, bus->res,
 	    rman_get_start(bus->res), new_end);
 	if (error)
 		return (error);
 	if (bootverbose)
 		device_printf(bus->dev, "grew bus range to %ju-%ju\n",
 		    rman_get_start(bus->res), rman_get_end(bus->res));
 	/* Only the bus numbers past the old end are new to the rman. */
 	error = rman_manage_region(&bus->rman, old_end + 1,
 	    rman_get_end(bus->res));
 	if (error)
 		panic("Failed to add resource to rman");
 	/* Record the new subordinate bus and program it into the bridge. */
 	bus->sub = rman_get_end(bus->res);
 	pci_write_config(bus->dev, bus->sub_reg, bus->sub, 1);
 	return (0);
 }
 
 /*
  * Allocate a range of bus numbers for 'child' from the bridge's
  * secondary bus range, growing the range via pcib_grow_subbus() if the
  * existing range cannot satisfy the request.  Returns the reserved
  * resource, or NULL if the request cannot be satisfied.
  */
 struct resource *
 pcib_alloc_subbus(struct pcib_secbus *bus, device_t child, int *rid,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct resource *res;
 	rman_res_t start_free, end_free, new_end;
 
 	/*
 	 * First, see if the request can be satisfied by the existing
 	 * bus range.
 	 */
 	res = pcib_suballoc_bus(bus, child, rid, start, end, count, flags);
 	if (res != NULL)
 		return (res);
 
 	/*
 	 * Figure out a range to grow the bus range.  First, find the
 	 * first bus number after the last allocated bus in the rman and
 	 * enforce that as a minimum starting point for the range.
 	 */
 	if (rman_last_free_region(&bus->rman, &start_free, &end_free) != 0 ||
 	    end_free != bus->sub)
 		start_free = bus->sub + 1;
 	if (start_free < start)
 		start_free = start;
 	new_end = start_free + count - 1;
 
 	/*
 	 * See if this new range would satisfy the request if it
 	 * succeeds.
 	 */
 	if (new_end > end)
 		return (NULL);
 
 	/* Finally, attempt to grow the existing resource. */
 	if (bootverbose) {
 		device_printf(bus->dev,
 		    "attempting to grow bus range for %ju buses\n", count);
 		printf("\tback candidate range: %ju-%ju\n", start_free,
 		    new_end);
 	}
 	/* Retry the original request against the enlarged range. */
 	if (pcib_grow_subbus(bus, new_end) == 0)
 		return (pcib_suballoc_bus(bus, child, rid, start, end, count,
 		    flags));
 	return (NULL);
 }
 #endif
 
 #else
 
 /*
  * Report whether the prefetchable memory window is open, i.e. whether
  * memory can be allocated from it.  The window counts as open when its
  * saved base is non-zero and below its saved limit.
  */
 static int
 pcib_is_prefetch_open(struct pcib_softc *sc)
 {
 
 	if (!(sc->pmembase > 0))
 		return (0);
 	return (sc->pmembase < sc->pmemlimit);
 }
 
 /*
  * Report whether the non-prefetchable memory window is open, i.e.
  * whether memory can be allocated from it.  The window counts as open
  * when its saved base is non-zero and below its saved limit.
  */
 static int
 pcib_is_nonprefetch_open(struct pcib_softc *sc)
 {
 
 	if (!(sc->membase > 0))
 		return (0);
 	return (sc->membase < sc->memlimit);
 }
 
 /*
  * Report whether the I/O window is open, i.e. whether ports can be
  * allocated from it.  The window counts as open when its saved base is
  * non-zero and below its saved limit.
  */
 static int
 pcib_is_io_open(struct pcib_softc *sc)
 {
 
 	if (!(sc->iobase > 0))
 		return (0);
 	return (sc->iobase < sc->iolimit);
 }
 
 /*
  * Get current I/O decode.
  *
  * Reads the bridge's I/O base and limit registers from config space
  * and stores the decoded values in sc->iobase and sc->iolimit.  The
  * upper 16-bit register of each pair is only read when the low
  * register reports PCIM_BRIO_32; otherwise zero is used for the upper
  * half.
  */
 static void
 pcib_get_io_decode(struct pcib_softc *sc)
 {
 	device_t	dev;
 	uint32_t	iolow;
 
 	dev = sc->dev;
 
 	/* I/O base. */
 	iolow = pci_read_config(dev, PCIR_IOBASEL_1, 1);
 	if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
 		sc->iobase = PCI_PPBIOBASE(
 		    pci_read_config(dev, PCIR_IOBASEH_1, 2), iolow);
 	else
 		sc->iobase = PCI_PPBIOBASE(0, iolow);
 
 	/* I/O limit. */
 	iolow = pci_read_config(dev, PCIR_IOLIMITL_1, 1);
 	if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32)
 		sc->iolimit = PCI_PPBIOLIMIT(
 		    pci_read_config(dev, PCIR_IOLIMITH_1, 2), iolow);
 	else
 		sc->iolimit = PCI_PPBIOLIMIT(0, iolow);
 }
 
 /*
  * Get current memory decode.
  *
  * Reads the memory and prefetchable memory base/limit registers from
  * config space and stores the decoded values in sc->membase,
  * sc->memlimit, sc->pmembase and sc->pmemlimit.  The upper 32-bit
  * prefetchable registers are only read when the low register reports
  * PCIM_BRPM_64; otherwise zero is used for the upper half.
  */
 static void
 pcib_get_mem_decode(struct pcib_softc *sc)
 {
 	device_t	dev;
 	pci_addr_t	pmemlow;
 
 	dev = sc->dev;
 
 	/* Non-prefetchable window: the upper half is always zero. */
 	sc->membase = PCI_PPBMEMBASE(0,
 	    pci_read_config(dev, PCIR_MEMBASE_1, 2));
 	sc->memlimit = PCI_PPBMEMLIMIT(0,
 	    pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
 
 	/* Prefetchable base. */
 	pmemlow = pci_read_config(dev, PCIR_PMBASEL_1, 2);
 	if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
 		sc->pmembase = PCI_PPBMEMBASE(
 		    pci_read_config(dev, PCIR_PMBASEH_1, 4), pmemlow);
 	else
 		sc->pmembase = PCI_PPBMEMBASE(0, pmemlow);
 
 	/* Prefetchable limit. */
 	pmemlow = pci_read_config(dev, PCIR_PMLIMITL_1, 2);
 	if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64)
 		sc->pmemlimit = PCI_PPBMEMLIMIT(
 		    pci_read_config(dev, PCIR_PMLIMITH_1, 4), pmemlow);
 	else
 		sc->pmemlimit = PCI_PPBMEMLIMIT(0, pmemlow);
 }
 
 /*
  * Restore previous I/O decode.
  *
  * Writes the saved sc->iobase and sc->iolimit back to the bridge's I/O
  * base and limit registers.  The upper 16-bit register of each pair is
  * only written when the saved value has bits set above bit 15.
  */
 static void
 pcib_set_io_decode(struct pcib_softc *sc)
 {
 	device_t	dev;
 	uint32_t	iohi;
 
 	dev = sc->dev;
 
 	/* I/O base: upper half first (if any), then the low byte. */
 	iohi = sc->iobase >> 16;
 	if (iohi > 0)
 		pci_write_config(dev, PCIR_IOBASEH_1, iohi, 2);
 	pci_write_config(dev, PCIR_IOBASEL_1, sc->iobase >> 8, 1);
 
 	/* I/O limit: upper half first (if any), then the low byte. */
 	iohi = sc->iolimit >> 16;
 	if (iohi > 0)
 		pci_write_config(dev, PCIR_IOLIMITH_1, iohi, 2);
 	pci_write_config(dev, PCIR_IOLIMITL_1, sc->iolimit >> 8, 1);
 }
 
 /*
  * Restore previous memory decode.
  *
  * Writes the saved memory and prefetchable memory base/limit values
  * back to the bridge.  The upper 32-bit prefetchable registers are
  * only written when the saved value has bits set above bit 31.
  */
 static void
 pcib_set_mem_decode(struct pcib_softc *sc)
 {
 	device_t	dev;
 	pci_addr_t	pmemhi;
 
 	dev = sc->dev;
 
 	/* Non-prefetchable window. */
 	pci_write_config(dev, PCIR_MEMBASE_1, sc->membase >> 16, 2);
 	pci_write_config(dev, PCIR_MEMLIMIT_1, sc->memlimit >> 16, 2);
 
 	/* Prefetchable base: upper half first (if any), then the low half. */
 	pmemhi = sc->pmembase >> 32;
 	if (pmemhi > 0)
 		pci_write_config(dev, PCIR_PMBASEH_1, pmemhi, 4);
 	pci_write_config(dev, PCIR_PMBASEL_1, sc->pmembase >> 16, 2);
 
 	/* Prefetchable limit: upper half first (if any), then the low half. */
 	pmemhi = sc->pmemlimit >> 32;
 	if (pmemhi > 0)
 		pci_write_config(dev, PCIR_PMLIMITH_1, pmemhi, 4);
 	pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmemlimit >> 16, 2);
 }
 #endif
 
 #ifdef PCI_HP
 /*
  * PCI-express HotPlug support.
  */
 /*
  * Knob consulted by pcib_probe_hotplug(); defaults to enabled (1) and
  * is exported as the enable_pcie_hp sysctl with CTLFLAG_RDTUN.
  */
+static int pci_enable_pcie_hp = 1;
+SYSCTL_INT(_hw_pci, OID_AUTO, enable_pcie_hp, CTLFLAG_RDTUN,
+    &pci_enable_pcie_hp, 0,
+    "Enable support for native PCI-express HotPlug.");
+
 /*
  * Decide whether this bridge should be managed as a PCI-express
  * HotPlug slot.  Does nothing if pci_enable_pcie_hp is zero, if the
  * device has no PCI-express capability, or if PCIEM_FLAGS_SLOT is not
  * set in its PCIER_FLAGS register.  Otherwise the link and slot
  * capabilities are cached in the softc and PCIB_HOTPLUG is set when the
  * slot capabilities report PCIEM_SLOT_CAP_HPC.
  */
 static void
 pcib_probe_hotplug(struct pcib_softc *sc)
 {
 	device_t dev;
+
+	if (!pci_enable_pcie_hp)
+		return;
 
 	dev = sc->dev;
 	if (pci_find_cap(dev, PCIY_EXPRESS, NULL) != 0)
 		return;
 
 	if (!(pcie_read_config(dev, PCIER_FLAGS, 2) & PCIEM_FLAGS_SLOT))
 		return;
 
 	/* Cache the capabilities; the other hotplug routines consult them. */
 	sc->pcie_link_cap = pcie_read_config(dev, PCIER_LINK_CAP, 4);
 	sc->pcie_slot_cap = pcie_read_config(dev, PCIER_SLOT_CAP, 4);
 
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_HPC)
 		sc->flags |= PCIB_HOTPLUG;
 }
 
 /*
  * Send a HotPlug command to the slot control register.  If this slot
  * uses command completion interrupts and a previous command is still
  * in progress, then the command is dropped.  Once the previous
  * command completes or times out, pcib_pcie_hotplug_update() will be
  * invoked to post a new command based on the slot's state at that
  * time.
  *
  * 'mask' selects the slot control bits to replace and 'val' supplies
  * their new values.  Nothing is written if the resulting register
  * value equals the current one.
  */
 static void
 pcib_pcie_hotplug_command(struct pcib_softc *sc, uint16_t val, uint16_t mask)
 {
 	device_t dev;
 	uint16_t ctl, new;
 
 	dev = sc->dev;
 
 	if (sc->flags & PCIB_HOTPLUG_CMD_PENDING)
 		return;
 
 	ctl = pcie_read_config(dev, PCIER_SLOT_CTL, 2);
 	new = (ctl & ~mask) | val;
 	if (new == ctl)
 		return;
 	pcie_write_config(dev, PCIER_SLOT_CTL, new, 2);
 	/*
 	 * Only wait for completion if the slot does not advertise
 	 * PCIEM_SLOT_CAP_NCCS and PCIEM_SLOT_CTL_CCIE was set both before
 	 * and after this write.
 	 */
 	if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS) &&
 	    (ctl & new) & PCIEM_SLOT_CTL_CCIE) {
 		sc->flags |= PCIB_HOTPLUG_CMD_PENDING;
 		/* The completion timeout is not armed while 'cold' is set. */
 		if (!cold)
 			callout_reset(&sc->pcie_cc_timer, hz,
 			    pcib_pcie_cc_timeout, sc);
 	}
 }
 
 /*
  * Handle a Command Completed event: if a command was pending, stop the
  * completion timer, clear the pending flag, and wake up anyone sleeping
  * on the softc.
  */
 static void
 pcib_pcie_hotplug_command_completed(struct pcib_softc *sc)
 {
 
 	if (bootverbose)
 		device_printf(sc->dev, "Command Completed\n");
 	if ((sc->flags & PCIB_HOTPLUG_CMD_PENDING) != 0) {
 		callout_stop(&sc->pcie_cc_timer);
 		sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
 		wakeup(sc);
 	}
 }
 
 /*
  * Returns true if a card is fully inserted from the user's
  * perspective.  It may not yet be ready for access, but the driver
  * can now start enabling access if necessary.
  *
  * The answer is derived from the cached slot status and capabilities
  * in the softc; no config space access is performed here.
  */
 static bool
 pcib_hotplug_inserted(struct pcib_softc *sc)
 {
 	bool forced_out, card_seen, power_fault, mrl_open;
 
 	/* A forced detach makes the slot look empty. */
 	forced_out = (sc->flags & PCIB_DETACHING) != 0;
 
 	/* Presence Detect State must report a card. */
 	card_seen = (sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS) != 0;
 
 	/* A power fault implicitly turns off power to the slot. */
 	power_fault = (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD) != 0;
 
 	/* If the MRL is disengaged, the slot is powered off. */
 	mrl_open = (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP) != 0 &&
 	    (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS) != 0;
 
 	return (!forced_out && card_seen && !power_fault && !mrl_open);
 }
 
 /*
  * Returns -1 if the card is fully inserted, powered, and ready for
  * access.  Otherwise, returns 0.
  *
  * The decision is based on the slot/link status and capabilities
  * cached in the softc: the card must be inserted according to
  * pcib_hotplug_inserted(), the Electromechanical Interlock must be
  * engaged when the slot has one, and the Data Link Layer must be active
  * when the link can report that state.
  */
 static int
 pcib_hotplug_present(struct pcib_softc *sc)
 {
 
 	/* Card must be inserted. */
 	if (!pcib_hotplug_inserted(sc))
 		return (0);
 
 	/*
 	 * Require the Electromechanical Interlock to be engaged if
 	 * present.
 	 */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP &&
 	    (sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) == 0)
 		return (0);
 
 	/* Require the Data Link Layer to be active. */
 	if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE) {
 		if (!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE))
 			return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Recompute the desired slot control state from the cached slot and
  * link status and post it to the slot via pcib_pcie_hotplug_command().
  *
  * 'val' and 'mask' carry caller-supplied slot control bits; the power
  * indicator, power controller and interlock bits computed here are
  * merged into them.  When 'schedule_task' is true and the presence of
  * the card no longer matches the presence of the child device, the
  * hotplug task is enqueued to add or remove the child.
  */
 static void
 pcib_pcie_hotplug_update(struct pcib_softc *sc, uint16_t val, uint16_t mask,
     bool schedule_task)
 {
 	bool card_inserted, ei_engaged;
 
 	/* Clear DETACHING if Present Detect has cleared. */
 	if ((sc->pcie_slot_sta & (PCIEM_SLOT_STA_PDC | PCIEM_SLOT_STA_PDS)) ==
 	    PCIEM_SLOT_STA_PDC)
 		sc->flags &= ~PCIB_DETACHING;
 
 	card_inserted = pcib_hotplug_inserted(sc);
 
 	/* Turn the power indicator on if a card is inserted. */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PIP) {
 		mask |= PCIEM_SLOT_CTL_PIC;
 		if (card_inserted)
 			val |= PCIEM_SLOT_CTL_PI_ON;
 		else if (sc->flags & PCIB_DETACH_PENDING)
 			val |= PCIEM_SLOT_CTL_PI_BLINK;
 		else
 			val |= PCIEM_SLOT_CTL_PI_OFF;
 	}
 
 	/* Turn the power on via the Power Controller if a card is inserted. */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) {
 		mask |= PCIEM_SLOT_CTL_PCC;
 		if (card_inserted)
 			val |= PCIEM_SLOT_CTL_PC_ON;
 		else
 			val |= PCIEM_SLOT_CTL_PC_OFF;
 	}
 
 	/*
 	 * If a card is inserted, enable the Electromechanical
 	 * Interlock.  If a card is not inserted (or we are in the
 	 * process of detaching), disable the Electromechanical
 	 * Interlock.  The control bit is only written when the
 	 * current interlock state differs from the desired one.
 	 */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP) {
 		mask |= PCIEM_SLOT_CTL_EIC;
 		ei_engaged = (sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) != 0;
 		if (card_inserted != ei_engaged)
 			val |= PCIEM_SLOT_CTL_EIC;
 	}
 
 	/*
 	 * Start a timer to see if the Data Link Layer times out.
 	 * Note that we only start the timer if Presence Detect
 	 * changed on this interrupt.  Stop any scheduled timer if
 	 * the Data Link Layer is active.
 	 */
 	if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE) {
 		if (card_inserted &&
 		    !(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE) &&
 		    sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC) {
 			if (cold)
 				device_printf(sc->dev,
 				    "Data Link Layer inactive\n");
 			else
 				callout_reset(&sc->pcie_dll_timer, hz,
 				    pcib_pcie_dll_timeout, sc);
 		} else if (sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE)
 			callout_stop(&sc->pcie_dll_timer);
 	}
 
 	pcib_pcie_hotplug_command(sc, val, mask);
 
 	/*
 	 * During attach the child "pci" device is added synchronously;
 	 * otherwise, the task is scheduled to manage the child
 	 * device.
 	 */
 	if (schedule_task &&
 	    (pcib_hotplug_present(sc) != 0) != (sc->child != NULL))
 		taskqueue_enqueue(taskqueue_thread, &sc->pcie_hp_task);
 }
 
 /*
  * HotPlug event handler.  'arg' is the bridge softc.  Reads and
  * acknowledges the slot status, reports and reacts to the individual
  * events, and finally calls pcib_pcie_hotplug_update() to bring the
  * slot control state and the child device in line with the new status.
  * Besides being installed as the interrupt handler, it is also called
  * directly by the command-completion and DLL timeout handlers.
  */
 static void
 pcib_pcie_intr(void *arg)
 {
 	struct pcib_softc *sc;
 	device_t dev;
 
 	sc = arg;
 	dev = sc->dev;
 	sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
 
 	/* Clear the events just reported. */
 	pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
 
 	/* The attention button toggles a delayed detach request. */
 	if (sc->pcie_slot_sta & PCIEM_SLOT_STA_ABP) {
 		if (sc->flags & PCIB_DETACH_PENDING) {	
 			device_printf(dev,
 			    "Attention Button Pressed: Detach Cancelled\n");
 			sc->flags &= ~PCIB_DETACH_PENDING;
 			callout_stop(&sc->pcie_ab_timer);
 		} else {
 			device_printf(dev,
 		    "Attention Button Pressed: Detaching in 5 seconds\n");
 			sc->flags |= PCIB_DETACH_PENDING;
 			callout_reset(&sc->pcie_ab_timer, 5 * hz,
 			    pcib_pcie_ab_timeout, sc);
 		}
 	}
 	if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD)
 		device_printf(dev, "Power Fault Detected\n");
 	if (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSC)
 		device_printf(dev, "MRL Sensor Changed to %s\n",
 		    sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS ? "open" :
 		    "closed");
 	if (bootverbose && sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC)
 		device_printf(dev, "Present Detect Changed to %s\n",
 		    sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS ? "card present" :
 		    "empty");
 	if (sc->pcie_slot_sta & PCIEM_SLOT_STA_CC)
 		pcib_pcie_hotplug_command_completed(sc);
 	/* Refresh the cached link status when the DLL state changed. */
 	if (sc->pcie_slot_sta & PCIEM_SLOT_STA_DLLSC) {
 		sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
 		if (bootverbose)
 			device_printf(dev,
 			    "Data Link Layer State Changed to %s\n",
 			    sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE ?
 			    "active" : "inactive");
 	}
 
 	pcib_pcie_hotplug_update(sc, 0, 0, true);
 }
 
 static void
 pcib_pcie_hotplug_task(void *context, int pending)
 {
 	struct pcib_softc *sc;
 	device_t dev;
 
 	sc = context;
 	mtx_lock(&Giant);
 	dev = sc->dev;
 	if (pcib_hotplug_present(sc) != 0) {
 		if (sc->child == NULL) {
 			sc->child = device_add_child(dev, "pci", -1);
 			bus_generic_attach(dev);
 		}
 	} else {
 		if (sc->child != NULL) {
 			if (device_delete_child(dev, sc->child) == 0)
 				sc->child = NULL;
 		}
 	}
 	mtx_unlock(&Giant);
 }
 
 /*
  * Callout handler for the attention button timer armed in
  * pcib_pcie_intr().  'arg' is the bridge softc.  If the detach request
  * is still pending when the timer fires, it is converted into a forced
  * detach and the slot state is updated.  Giant must be held.
  */
 static void
 pcib_pcie_ab_timeout(void *arg)
 {
 	struct pcib_softc *sc;
 
 	sc = arg;
 	mtx_assert(&Giant, MA_OWNED);
 	if (sc->flags & PCIB_DETACH_PENDING) {
 		sc->flags |= PCIB_DETACHING;
 		sc->flags &= ~PCIB_DETACH_PENDING;
 		pcib_pcie_hotplug_update(sc, 0, 0, true);
 	}
 }
 
 /*
  * Callout handler for the command completion timer.  'arg' is the
  * bridge softc.  If the slot status already shows Command Completed,
  * the interrupt was missed and the handler is run by hand; otherwise
  * the command is treated as timed out and a detach is forced.  Giant
  * must be held.
  */
 static void
 pcib_pcie_cc_timeout(void *arg)
 {
 	struct pcib_softc *sc = arg;
 	device_t bridge = sc->dev;
 	uint16_t slot_sta;
 
 	mtx_assert(&Giant, MA_OWNED);
 	slot_sta = pcie_read_config(bridge, PCIER_SLOT_STA, 2);
 	if ((slot_sta & PCIEM_SLOT_STA_CC) != 0) {
 		device_printf(bridge,
 	    "Missed HotPlug interrupt waiting for Command Completion\n");
 		pcib_pcie_intr(sc);
 		return;
 	}
 
 	device_printf(bridge,
 	    "Hotplug Command Timed Out - forcing detach\n");
 	sc->flags &= ~(PCIB_HOTPLUG_CMD_PENDING | PCIB_DETACH_PENDING);
 	sc->flags |= PCIB_DETACHING;
 	pcib_pcie_hotplug_update(sc, 0, 0, true);
 }
 
 /*
  * Callout handler for the Data Link Layer timer.  'arg' is the bridge
  * softc.  If the link is still not active, a detach is forced.  If the
  * link is active but the link status differs from the cached copy, the
  * interrupt was missed and the handler is run by hand.  Giant must be
  * held.
  */
 static void
 pcib_pcie_dll_timeout(void *arg)
 {
 	struct pcib_softc *sc = arg;
 	device_t bridge = sc->dev;
 	uint16_t link_sta;
 
 	mtx_assert(&Giant, MA_OWNED);
 	link_sta = pcie_read_config(bridge, PCIER_LINK_STA, 2);
 	if ((link_sta & PCIEM_LINK_STA_DL_ACTIVE) == 0) {
 		device_printf(bridge,
 		    "Timed out waiting for Data Link Layer Active\n");
 		sc->flags |= PCIB_DETACHING;
 		pcib_pcie_hotplug_update(sc, 0, 0, true);
 		return;
 	}
 
 	if (link_sta != sc->pcie_link_sta) {
 		device_printf(bridge,
 		    "Missed HotPlug interrupt waiting for DLL Active\n");
 		pcib_pcie_intr(sc);
 	}
 }
 
 /*
  * Allocate the interrupt used for PCI-e HotPlug events and install
  * pcib_pcie_intr() as its handler.  MSI-X is tried first (only when
  * the device has exactly one message), then MSI with a single message,
  * and finally rid 0 is used if neither could be allocated.
  *
  * Returns 0 on success, ENXIO if the IRQ resource cannot be allocated,
  * or the error from bus_setup_intr().  On failure any MSI/MSI-X
  * allocation made here is released again.
  */
 static int
 pcib_alloc_pcie_irq(struct pcib_softc *sc)
 {
 	device_t dev;
 	int count, error, rid;
 
 	/* rid stays negative until a message-signalled IRQ is obtained. */
 	rid = -1;
 	dev = sc->dev;
 
 	/*
 	 * For simplicity, only use MSI-X if there is a single message.
 	 * To support a device with multiple messages we would have to
 	 * use remap intr if the MSI number is not 0.
 	 */
 	count = pci_msix_count(dev);
 	if (count == 1) {
 		error = pci_alloc_msix(dev, &count);
 		if (error == 0)
 			rid = 1;
 	}
 
 	if (rid < 0 && pci_msi_count(dev) > 0) {
 		count = 1;
 		error = pci_alloc_msi(dev, &count);
 		if (error == 0)
 			rid = 1;
 	}
 
 	/* No MSI-X or MSI was allocated: fall back to rid 0. */
 	if (rid < 0)
 		rid = 0;
 
 	sc->pcie_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
 	    RF_ACTIVE);
 	if (sc->pcie_irq == NULL) {
 		device_printf(dev,
 		    "Failed to allocate interrupt for PCI-e events\n");
 		if (rid > 0)
 			pci_release_msi(dev);
 		return (ENXIO);
 	}
 
 	error = bus_setup_intr(dev, sc->pcie_irq, INTR_TYPE_MISC,
 	    NULL, pcib_pcie_intr, sc, &sc->pcie_ihand);
 	if (error) {
 		device_printf(dev, "Failed to setup PCI-e interrupt handler\n");
 		bus_release_resource(dev, SYS_RES_IRQ, rid, sc->pcie_irq);
 		if (rid > 0)
 			pci_release_msi(dev);
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Undo pcib_alloc_pcie_irq(): tear down the interrupt handler, free
  * the IRQ resource, and release MSI.  The steps run in that order and
  * stop at the first failure, whose error code is returned; 0 is
  * returned if every step succeeds.
  */
 static int
 pcib_release_pcie_irq(struct pcib_softc *sc)
 {
 	device_t bridge = sc->dev;
 	int rc;
 
 	rc = bus_teardown_intr(bridge, sc->pcie_irq, sc->pcie_ihand);
 	if (rc == 0)
 		rc = bus_free_resource(bridge, SYS_RES_IRQ, sc->pcie_irq);
 	if (rc == 0)
 		rc = pci_release_msi(bridge);
 	return (rc);
 }
 
 /*
  * Prepare a HotPlug-capable bridge for operation: initialize the
  * callouts and the hotplug task, allocate the event interrupt, cache
  * the current link and slot status, acknowledge pending events, and
  * enable the HotPlug events the slot's capabilities call for.  Returns
  * early, without enabling any events, if the interrupt cannot be
  * allocated.
  */
 static void
 pcib_setup_hotplug(struct pcib_softc *sc)
 {
 	device_t dev;
 	uint16_t mask, val;
 
 	dev = sc->dev;
 	callout_init(&sc->pcie_ab_timer, 0);
 	callout_init(&sc->pcie_cc_timer, 0);
 	callout_init(&sc->pcie_dll_timer, 0);
 	TASK_INIT(&sc->pcie_hp_task, 0, pcib_pcie_hotplug_task, sc);
 
 	/* Allocate IRQ. */
 	if (pcib_alloc_pcie_irq(sc) != 0)
 		return;
 
 	sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2);
 	sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2);
 
 	/* Clear any events previously pending. */
 	pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2);
 
 	/* Enable HotPlug events. */
 	mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
 	    PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
 	    PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
 	val = PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_HPIE;
 	/* The remaining events are enabled only if the slot supports them. */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_APB)
 		val |= PCIEM_SLOT_CTL_ABPE;
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP)
 		val |= PCIEM_SLOT_CTL_PFDE;
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP)
 		val |= PCIEM_SLOT_CTL_MRLSCE;
 	if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS))
 		val |= PCIEM_SLOT_CTL_CCIE;
 	if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE)
 		val |= PCIEM_SLOT_CTL_DLLSCE;
 
 	/* Turn the attention indicator off. */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
 		mask |= PCIEM_SLOT_CTL_AIC;
 		val |= PCIEM_SLOT_CTL_AI_OFF;
 	}
 
 	/* The hotplug task is not scheduled from here. */
 	pcib_pcie_hotplug_update(sc, val, mask, false);
 }
 
 /*
  * Shut down HotPlug handling for a bridge that is being detached:
  * force the slot into the detaching state, wait briefly for any
  * in-flight command, disable all HotPlug events, release the event
  * interrupt, and drain the hotplug task and callouts.  Returns 0 on
  * success or the error from pcib_release_pcie_irq(), in which case the
  * task and callouts are not drained.
  */
 static int
 pcib_detach_hotplug(struct pcib_softc *sc)
 {
 	uint16_t mask, val;
 	int error;
 
 	/* Disable the card in the slot and force it to detach. */
 	if (sc->flags & PCIB_DETACH_PENDING) {
 		sc->flags &= ~PCIB_DETACH_PENDING;
 		callout_stop(&sc->pcie_ab_timer);
 	}
 	sc->flags |= PCIB_DETACHING;
 
 	/*
 	 * Give a pending command up to one second (hz ticks) to complete;
 	 * pcib_pcie_hotplug_command_completed() wakes this sleep up.
 	 */
 	if (sc->flags & PCIB_HOTPLUG_CMD_PENDING) {
 		callout_stop(&sc->pcie_cc_timer);
 		tsleep(sc, 0, "hpcmd", hz);
 		sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING;
 	}
 
 	/* Disable HotPlug events. */
 	mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE |
 	    PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE |
 	    PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE;
 	val = 0;
 
 	/* Turn the attention indicator off. */
 	if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) {
 		mask |= PCIEM_SLOT_CTL_AIC;
 		val |= PCIEM_SLOT_CTL_AI_OFF;
 	}
 
 	pcib_pcie_hotplug_update(sc, val, mask, false);
 	
 	error = pcib_release_pcie_irq(sc);
 	if (error)
 		return (error);
 	taskqueue_drain(taskqueue_thread, &sc->pcie_hp_task);
 	callout_drain(&sc->pcie_ab_timer);
 	callout_drain(&sc->pcie_cc_timer);
 	callout_drain(&sc->pcie_dll_timer);
 	return (0);
 }
 #endif
 
 /*
  * Get current bridge configuration.
  *
  * Without NEW_PCIB, the I/O and memory decode registers are saved into
  * the softc, each only if the corresponding enable bit is set in the
  * command register.  With NEW_PCIB this function does nothing.
  */
 static void
 pcib_cfg_save(struct pcib_softc *sc)
 {
 #ifndef NEW_PCIB
 	uint16_t cmd;
 
 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
 	if ((cmd & PCIM_CMD_PORTEN) != 0)
 		pcib_get_io_decode(sc);
 	if ((cmd & PCIM_CMD_MEMEN) != 0)
 		pcib_get_mem_decode(sc);
 #endif
 }
 
 /*
  * Restore previous bridge configuration.
  *
  * With NEW_PCIB, the I/O, memory and prefetchable memory windows are
  * rewritten via pcib_write_windows().  Otherwise the saved I/O and
  * memory decode registers are written back, each only if the
  * corresponding enable bit is set in the command register.
  */
 static void
 pcib_cfg_restore(struct pcib_softc *sc)
 {
 #ifdef NEW_PCIB
 
 	pcib_write_windows(sc, WIN_IO | WIN_MEM | WIN_PMEM);
 #else
 	uint16_t command;
 
 	/* The device handle is only needed on this legacy path. */
 	command = pci_read_config(sc->dev, PCIR_COMMAND, 2);
 	if (command & PCIM_CMD_PORTEN)
 		pcib_set_io_decode(sc);
 	if (command & PCIM_CMD_MEMEN)
 		pcib_set_mem_decode(sc);
 #endif
 }
 
 /*
  * Generic device interface
  */
 /*
  * Probe method: claim devices whose class is PCIC_BRIDGE and whose
  * subclass is PCIS_BRIDGE_PCI with a priority of -10000; reject
  * everything else with ENXIO.
  */
 static int
 pcib_probe(device_t dev)
 {
     if (pci_get_class(dev) != PCIC_BRIDGE ||
 	pci_get_subclass(dev) != PCIS_BRIDGE_PCI)
 	return(ENXIO);
 
     device_set_desc(dev, "PCI-PCI bridge");
     return(-10000);
 }
 
 /*
  * Common attach work for PCI-PCI bridges: record the bridge's current
  * configuration in the softc, program the primary bus register,
  * publish the sysctl nodes, apply device-specific quirks, set up the
  * secondary bus / windows / HotPlug support when those options are
  * compiled in, optionally print the resulting configuration, and
  * enable busmastering.  The child bus is not attached here; see
  * pcib_attach_child().
  */
 void
 pcib_attach_common(device_t dev)
 {
     struct pcib_softc	*sc;
     struct sysctl_ctx_list *sctx;
     struct sysctl_oid	*soid;
     int comma;
 
     sc = device_get_softc(dev);
     sc->dev = dev;
 
     /*
      * Get current bridge configuration.
      */
     sc->domain = pci_get_domain(dev);
 #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
     sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1);
     sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
 #endif
     sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2);
     pcib_cfg_save(sc);
 
     /*
      * The primary bus register should always be the bus of the
      * parent.
      */
     sc->pribus = pci_get_bus(dev);
     pci_write_config(dev, PCIR_PRIBUS_1, sc->pribus, 1);
 
     /*
      * Setup sysctl reporting nodes
      */
     sctx = device_get_sysctl_ctx(dev);
     soid = device_get_sysctl_tree(dev);
     SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "domain",
       CTLFLAG_RD, &sc->domain, 0, "Domain number");
     SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "pribus",
       CTLFLAG_RD, &sc->pribus, 0, "Primary bus number");
     SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "secbus",
       CTLFLAG_RD, &sc->bus.sec, 0, "Secondary bus number");
     SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "subbus",
       CTLFLAG_RD, &sc->bus.sub, 0, "Subordinate bus number");
 
     /*
      * Quirk handling.
      */
     switch (pci_get_devid(dev)) {
 #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
     case 0x12258086:		/* Intel 82454KX/GX (Orion) */
 	{
 	    uint8_t	supbus;
 
 	    /* Derive the bus numbers from config register 0x41. */
 	    supbus = pci_read_config(dev, 0x41, 1);
 	    if (supbus != 0xff) {
 		sc->bus.sec = supbus + 1;
 		sc->bus.sub = supbus + 1;
 	    }
 	    break;
 	}
 #endif
 
     /*
      * The i82380FB mobile docking controller is a PCI-PCI bridge,
      * and it is a subtractive bridge.  However, the ProgIf is wrong
      * so the normal setting of PCIB_SUBTRACTIVE bit doesn't
      * happen.  There are also Toshiba and Cavium ThunderX bridges
      * that behave this way.
      */
     case 0xa002177d:		/* Cavium ThunderX */
     case 0x124b8086:		/* Intel 82380FB Mobile */
     case 0x060513d7:		/* Toshiba ???? */
 	sc->flags |= PCIB_SUBTRACTIVE;
 	break;
 
 #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS))
     /* Compaq R3000 BIOS sets wrong subordinate bus number. */
     case 0x00dd10de:
 	{
 	    char *cp;
 
 	    /* Only apply the fix when both SMBIOS strings match. */
 	    if ((cp = kern_getenv("smbios.planar.maker")) == NULL)
 		break;
 	    if (strncmp(cp, "Compal", 6) != 0) {
 		freeenv(cp);
 		break;
 	    }
 	    freeenv(cp);
 	    if ((cp = kern_getenv("smbios.planar.product")) == NULL)
 		break;
 	    if (strncmp(cp, "08A0", 4) != 0) {
 		freeenv(cp);
 		break;
 	    }
 	    freeenv(cp);
 	    if (sc->bus.sub < 0xa) {
 		pci_write_config(dev, PCIR_SUBBUS_1, 0xa, 1);
 		sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1);
 	    }
 	    break;
 	}
 #endif
     }
 
     if (pci_msi_device_blacklisted(dev))
 	sc->flags |= PCIB_DISABLE_MSI;
 
     if (pci_msix_device_blacklisted(dev))
 	sc->flags |= PCIB_DISABLE_MSIX;
 
     /*
      * Intel 815, 845 and other chipsets say they are PCI-PCI bridges,
      * but have a ProgIF of 0x80.  The 82801 family (AA, AB, BAM/CAM,
      * BA/CA/DB and E) PCI bridges are HUB-PCI bridges, in Intelese.
      * This means they act as if they were subtractively decoding
      * bridges and pass all transactions.  Mark them and real ProgIf 1
      * parts as subtractive.
      */
     if ((pci_get_devid(dev) & 0xff00ffff) == 0x24008086 ||
       pci_read_config(dev, PCIR_PROGIF, 1) == PCIP_BRIDGE_PCI_SUBTRACTIVE)
 	sc->flags |= PCIB_SUBTRACTIVE;
 
 #ifdef PCI_HP
     pcib_probe_hotplug(sc);
 #endif
 #ifdef NEW_PCIB
 #ifdef PCI_RES_BUS
     pcib_setup_secbus(dev, &sc->bus, 1);
 #endif
     pcib_probe_windows(sc);
 #endif
 #ifdef PCI_HP
     if (sc->flags & PCIB_HOTPLUG)
 	    pcib_setup_hotplug(sc);
 #endif
     /* Report the resulting configuration on verbose boots. */
     if (bootverbose) {
 	device_printf(dev, "  domain            %d\n", sc->domain);
 	device_printf(dev, "  secondary bus     %d\n", sc->bus.sec);
 	device_printf(dev, "  subordinate bus   %d\n", sc->bus.sub);
 #ifdef NEW_PCIB
 	if (pcib_is_window_open(&sc->io))
 	    device_printf(dev, "  I/O decode        0x%jx-0x%jx\n",
 	      (uintmax_t)sc->io.base, (uintmax_t)sc->io.limit);
 	if (pcib_is_window_open(&sc->mem))
 	    device_printf(dev, "  memory decode     0x%jx-0x%jx\n",
 	      (uintmax_t)sc->mem.base, (uintmax_t)sc->mem.limit);
 	if (pcib_is_window_open(&sc->pmem))
 	    device_printf(dev, "  prefetched decode 0x%jx-0x%jx\n",
 	      (uintmax_t)sc->pmem.base, (uintmax_t)sc->pmem.limit);
 #else
 	if (pcib_is_io_open(sc))
 	    device_printf(dev, "  I/O decode        0x%x-0x%x\n",
 	      sc->iobase, sc->iolimit);
 	if (pcib_is_nonprefetch_open(sc))
 	    device_printf(dev, "  memory decode     0x%jx-0x%jx\n",
 	      (uintmax_t)sc->membase, (uintmax_t)sc->memlimit);
 	if (pcib_is_prefetch_open(sc))
 	    device_printf(dev, "  prefetched decode 0x%jx-0x%jx\n",
 	      (uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
 #endif
 	if (sc->bridgectl & (PCIB_BCR_ISA_ENABLE | PCIB_BCR_VGA_ENABLE) ||
 	    sc->flags & PCIB_SUBTRACTIVE) {
 		device_printf(dev, "  special decode    ");
 		/* 'comma' tracks whether a separator is needed. */
 		comma = 0;
 		if (sc->bridgectl & PCIB_BCR_ISA_ENABLE) {
 			printf("ISA");
 			comma = 1;
 		}
 		if (sc->bridgectl & PCIB_BCR_VGA_ENABLE) {
 			printf("%sVGA", comma ? ", " : "");
 			comma = 1;
 		}
 		if (sc->flags & PCIB_SUBTRACTIVE)
 			printf("%ssubtractive", comma ? ", " : "");
 		printf("\n");
 	}
     }
 
     /*
      * Always enable busmastering on bridges so that transactions
      * initiated on the secondary bus are passed through to the
      * primary bus.
      */
     pci_enable_busmaster(dev);
 }
 
 #ifdef PCI_HP
 /*
  * Report whether a child bus should exist behind this bridge.  Bridges
  * without PCIB_HOTPLUG always answer 1; HotPlug bridges answer 1 only
  * when pcib_hotplug_present() reports a usable card.
  */
 static int
 pcib_present(struct pcib_softc *sc)
 {
 
 	if ((sc->flags & PCIB_HOTPLUG) == 0)
 		return (1);
 	return (pcib_hotplug_present(sc) != 0);
 }
 #endif
 
 /*
  * Add the child "pci" bus behind the bridge and attach it.  Returns 0
  * without adding a child when the bridge has no secondary bus number
  * or, with PCI_HP, when the HotPlug slot is empty; otherwise returns
  * the result of bus_generic_attach().
  */
 int
 pcib_attach_child(device_t dev)
 {
 	struct pcib_softc *softc = device_get_softc(dev);
 
 	/* no secondary bus; we should have fixed this */
 	if (softc->bus.sec == 0)
 		return(0);
 
 #ifdef PCI_HP
 	/* An empty HotPlug slot, so don't add a PCI bus yet. */
 	if (pcib_present(softc) == 0)
 		return (0);
 #endif
 
 	softc->child = device_add_child(dev, "pci", -1);
 	return (bus_generic_attach(dev));
 }
 
 /*
  * Attach method: perform the common bridge setup and then attach the
  * child bus, returning the result of pcib_attach_child().
  */
 int
 pcib_attach(device_t dev)
 {
 
     pcib_attach_common(dev);
     return (pcib_attach_child(dev));
 }
 
 /*
  * Detach method: detach the children, shut down HotPlug handling when
  * it is in use, delete the children, and finally release the windows
  * and the secondary bus range when those options are compiled in.
  * Returns 0 on success or the first error encountered, in which case
  * the remaining steps are skipped.
  */
 int
 pcib_detach(device_t dev)
 {
 #if defined(PCI_HP) || defined(NEW_PCIB)
 	struct pcib_softc *sc;
 #endif
 	int error;
 
 #if defined(PCI_HP) || defined(NEW_PCIB)
 	sc = device_get_softc(dev);
 #endif
 	error = bus_generic_detach(dev);
 	if (error)
 		return (error);
 #ifdef PCI_HP
 	if (sc->flags & PCIB_HOTPLUG) {
 		error = pcib_detach_hotplug(sc);
 		if (error)
 			return (error);
 	}
 #endif
 	error = device_delete_children(dev);
 	if (error)
 		return (error);
 #ifdef NEW_PCIB
 	pcib_free_windows(sc);
 #ifdef PCI_RES_BUS
 	pcib_free_secbus(dev, &sc->bus);
 #endif
 #endif
 	return (0);
 }
 
 /*
  * Suspend method: save the bridge configuration, then suspend the
  * children and return the result of bus_generic_suspend().
  */
 int
 pcib_suspend(device_t dev)
 {
 	struct pcib_softc *sc;
 
 	sc = device_get_softc(dev);
 	pcib_cfg_save(sc);
 	return (bus_generic_suspend(dev));
 }
 
 /*
  * Resume method: restore the bridge configuration, then resume the
  * children and return the result of bus_generic_resume().
  */
 int
 pcib_resume(device_t dev)
 {
 	struct pcib_softc *sc;
 
 	sc = device_get_softc(dev);
 	pcib_cfg_restore(sc);
 	return (bus_generic_resume(dev));
 }
 
 /*
  * Program the I/O, memory and prefetchable memory windows of a bridge
  * so that every base register holds all-ones and every limit register
  * holds zero, i.e. each base is above its limit.
  */
 void
 pcib_bridge_init(device_t dev)
 {
 	/* I/O window. */
 	pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
 	pci_write_config(dev, PCIR_IOBASEH_1, 0xffff, 2);
 	pci_write_config(dev, PCIR_IOLIMITL_1, 0, 1);
 	pci_write_config(dev, PCIR_IOLIMITH_1, 0, 2);
 	/* Memory window. */
 	pci_write_config(dev, PCIR_MEMBASE_1, 0xffff, 2);
 	pci_write_config(dev, PCIR_MEMLIMIT_1, 0, 2);
 	/* Prefetchable memory window. */
 	pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
 	pci_write_config(dev, PCIR_PMBASEH_1, 0xffffffff, 4);
 	pci_write_config(dev, PCIR_PMLIMITL_1, 0, 2);
 	pci_write_config(dev, PCIR_PMLIMITH_1, 0, 4);
 }
 
 /*
  * Report whether children of this bridge are present.  The answer starts
  * from bus_child_present() on the bridge itself; with PCI_HP, a non-zero
  * answer on a PCIB_HOTPLUG bridge is replaced by pcib_hotplug_present().
  */
 int
 pcib_child_present(device_t dev, device_t child)
 {
 	int present;
 #ifdef PCI_HP
 	struct pcib_softc *softc;
 #endif
 
 	present = bus_child_present(dev);
 #ifdef PCI_HP
 	softc = device_get_softc(dev);
 	if (present != 0 && (softc->flags & PCIB_HOTPLUG) != 0)
 		present = pcib_hotplug_present(softc);
 #endif
 	return (present);
 }
 
 /*
  * Read a bridge instance variable.  PCIB_IVAR_DOMAIN yields the softc's
  * domain and PCIB_IVAR_BUS its secondary bus number; any other index
  * returns ENOENT and leaves *result untouched.
  */
 int
 pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result)
 {
 	struct pcib_softc *softc;
 
 	softc = device_get_softc(dev);
 	if (which == PCIB_IVAR_DOMAIN)
 		*result = softc->domain;
 	else if (which == PCIB_IVAR_BUS)
 		*result = softc->bus.sec;
 	else
 		return (ENOENT);
 	return (0);
 }
 
 /*
  * Write a bridge instance variable.  Both known variables are read-only
  * (EINVAL); unknown indices return ENOENT.
  */
 int
 pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t value)
 {
 
 	if (which == PCIB_IVAR_DOMAIN || which == PCIB_IVAR_BUS)
 		return (EINVAL);
 	return (ENOENT);
 }
 
 #ifdef NEW_PCIB
 /*
  * Try to satisfy a child's request out of the range already managed by
  * window 'w'.  Returns the reserved resource, or NULL if the window is
  * not open, the reservation fails, or a requested activation fails.
  */
 static struct resource *
 pcib_suballoc_resource(struct pcib_softc *sc, struct pcib_window *w,
     device_t child, int type, int *rid, rman_res_t start, rman_res_t end,
     rman_res_t count, u_int flags)
 {
 	struct resource *r;
 
 	if (!pcib_is_window_open(w))
 		return (NULL);
 
 	/* Reserve without RF_ACTIVE; activation is requested separately. */
 	r = rman_reserve_resource(&w->rman, start, end, count,
 	    flags & ~RF_ACTIVE, child);
 	if (r == NULL)
 		return (NULL);
 
 	if (bootverbose) {
 		device_printf(sc->dev,
 		    "allocated %s range (%#jx-%#jx) for rid %x of %s\n",
 		    w->name, rman_get_start(r), rman_get_end(r), *rid,
 		    pcib_child_name(child));
 	}
 	rman_set_rid(r, *rid);
 
 	/*
 	 * An active resource was asked for: forward the activation up
 	 * the tree.  This relies on the parent drivers being able to
 	 * activate sub-allocated resources.  Undo the reservation if the
 	 * activation is refused.
 	 */
 	if ((flags & RF_ACTIVE) != 0 &&
 	    bus_activate_resource(child, type, *rid, r) != 0) {
 		rman_release_resource(r);
 		return (NULL);
 	}
 
 	return (r);
 }
 
 /*
  * Allocate a fresh resource range for an unconfigured window.
  *
  * On success w->base and w->limit describe the new window and 0 is
  * returned; ENOSPC is returned when no suitable range could be obtained.
  * Two strategies are used:
  *  - I/O windows behind an ISA-enabled bridge with a request starting
  *    below 64k try 0x1000-sized windows from 0xf000 downwards using
  *    pcib_alloc_nonisa_ranges();
  *  - everything else widens the request to the window granularity
  *    (1 << w->step) and allocates it from the parent with
  *    bus_alloc_resource().
  */
 static int
 pcib_alloc_new_window(struct pcib_softc *sc, struct pcib_window *w, int type,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct resource *res;
 	rman_res_t base, limit, wmask;
 	int rid;
 
 	/*
 	 * If this is an I/O window on a bridge with ISA enable set
 	 * and the start address is below 64k, then try to allocate an
 	 * initial window of 0x1000 bytes long starting at address
 	 * 0xf000 and walking down.  Note that if the original request
 	 * was larger than the non-aliased range size of 0x100 our
 	 * caller would have raised the start address up to 64k
 	 * already.
 	 */
 	if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
 	    start < 65536) {
 		/* The (long) cast ends the loop once base steps below 0. */
 		for (base = 0xf000; (long)base >= 0; base -= 0x1000) {
 			limit = base + 0xfff;
 
 			/*
 			 * Skip ranges that wouldn't work for the
 			 * original request.  Note that the actual
 			 * window that overlaps are the non-alias
 			 * ranges within [base, limit], so this isn't
 			 * quite a simple comparison.
 			 */
 			if (start + count > limit - 0x400)
 				continue;
 			if (base == 0) {
 				/*
 				 * The first open region for the window at
 				 * 0 is 0x400-0x4ff.
 				 */
 				if (end - count + 1 < 0x400)
 					continue;
 			} else {
 				if (end - count + 1 < base)
 					continue;
 			}
 
 			if (pcib_alloc_nonisa_ranges(sc, base, limit) == 0) {
 				w->base = base;
 				w->limit = limit;
 				return (0);
 			}
 		}
 		return (ENOSPC);
 	}
 
 	/*
 	 * Widen the request to the window granularity: raise the
 	 * alignment to at least w->step, round start down and end up to
 	 * a step boundary, and round the size up to a multiple of it.
 	 */
 	wmask = ((rman_res_t)1 << w->step) - 1;
 	if (RF_ALIGNMENT(flags) < w->step) {
 		flags &= ~RF_ALIGNMENT_MASK;
 		flags |= RF_ALIGNMENT_LOG2(w->step);
 	}
 	start &= ~wmask;
 	end |= wmask;
 	count = roundup2(count, (rman_res_t)1 << w->step);
 	rid = w->reg;
 	/* The range is requested from the parent without RF_ACTIVE. */
 	res = bus_alloc_resource(sc->dev, type, &rid, start, end, count,
 	    flags & ~RF_ACTIVE);
 	if (res == NULL)
 		return (ENOSPC);
 	pcib_add_window_resources(w, &res, 1);
 	pcib_activate_window(sc, type);
 	w->base = rman_get_start(res);
 	w->limit = rman_get_end(res);
 	return (0);
 }
 
 /*
  * Try to expand an existing window to the requested base and limit.
  *
  * Exactly one end of the window may move per call: either 'base' is
  * lower than w->base or 'limit' is higher than w->limit, never both.
  *
  * Returns 0 on success, in which case w->base / w->limit describe the
  * enlarged window.  On failure an errno value is returned and w->base and
  * w->limit keep the values they had on entry.
  */
 static int
 pcib_expand_window(struct pcib_softc *sc, struct pcib_window *w, int type,
     rman_res_t base, rman_res_t limit)
 {
 	struct resource *res;
 	rman_res_t old_base, old_limit;
 	int error, i, force_64k_base;
 
 	KASSERT(base <= w->base && limit >= w->limit,
 	    ("attempting to shrink window"));
 
 	/*
 	 * XXX: pcib_grow_window() doesn't try to do this anyway and
 	 * the error handling for all the edge cases would be tedious.
 	 */
 	KASSERT(limit == w->limit || base == w->base,
 	    ("attempting to grow both ends of a window"));
 
 	/*
 	 * Yet more special handling for requests to expand an I/O
 	 * window behind an ISA-enabled bridge.  Since I/O windows
 	 * have to grow in 0x1000 increments and the end of the 0xffff
 	 * range is an alias, growing a window below 64k will always
 	 * result in allocating new resources and never adjusting an
 	 * existing resource.
 	 */
 	if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
 	    (limit <= 65535 || (base <= 65535 && base != w->base))) {
 		KASSERT(limit == w->limit || limit <= 65535,
 		    ("attempting to grow both ends across 64k ISA alias"));
 
 		if (base != w->base)
 			error = pcib_alloc_nonisa_ranges(sc, base, w->base - 1);
 		else
 			error = pcib_alloc_nonisa_ranges(sc, w->limit + 1,
 			    limit);
 		if (error == 0) {
 			w->base = base;
 			w->limit = limit;
 		}
 		return (error);
 	}
 
 	/*
 	 * Find the existing resource to adjust.  Usually there is only one,
 	 * but for an ISA-enabled bridge we might be growing the I/O window
 	 * above 64k and need to find the existing resource that maps all
 	 * of the area above 64k.
 	 */
 	for (i = 0; i < w->count; i++) {
 		if (rman_get_end(w->res[i]) == w->limit)
 			break;
 	}
 	KASSERT(i != w->count, ("did not find existing resource"));
 	res = w->res[i];
 
 	/*
 	 * Usually the resource we found should match the window's
 	 * existing range.  The one exception is the ISA-enabled case
 	 * mentioned above in which case the resource should start at
 	 * 64k.
 	 */
 	if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE &&
 	    w->base <= 65535) {
 		KASSERT(rman_get_start(res) == 65536,
 		    ("existing resource mismatch"));
 		force_64k_base = 1;
 	} else {
 		KASSERT(w->base == rman_get_start(res),
 		    ("existing resource mismatch"));
 		force_64k_base = 0;
 	}
 
 	error = bus_adjust_resource(sc->dev, type, res, force_64k_base ?
 	    rman_get_start(res) : base, limit);
 	if (error)
 		return (error);
 
 	/*
 	 * Add the newly allocated region to the resource manager.  The
 	 * window's recorded range is only updated once that has
 	 * succeeded, so that a failure can shrink the parent resource
 	 * back to the range the window really had and leave w->base and
 	 * w->limit consistent with what the rman manages.
 	 */
 	old_base = w->base;
 	old_limit = w->limit;
 	if (old_base != base)
 		error = rman_manage_region(&w->rman, base, old_base - 1);
 	else
 		error = rman_manage_region(&w->rman, old_limit + 1, limit);
 	if (error) {
 		if (bootverbose)
 			device_printf(sc->dev,
 			    "failed to expand %s resource manager\n", w->name);
 		(void)bus_adjust_resource(sc->dev, type, res, force_64k_base ?
 		    rman_get_start(res) : old_base, old_limit);
 		return (error);
 	}
 
 	/* Commit the end of the window that actually moved. */
 	if (old_base != base)
 		w->base = base;
 	else
 		w->limit = limit;
 	return (0);
 }
 
 /*
  * Attempt to grow a window to make room for a given resource request.
  *
  * Returns 0 once the window has been created or enlarged and the new
  * range has been written with pcib_write_windows().  Returns EINVAL if
  * the window is not valid or the clamped request cannot fit, and
  * otherwise the error from pcib_alloc_new_window() or
  * pcib_expand_window() (ENOSPC when neither end of the window yields a
  * usable candidate range).
  */
 static int
 pcib_grow_window(struct pcib_softc *sc, struct pcib_window *w, int type,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	rman_res_t align, start_free, end_free, front, back, wmask;
 	int error;
 
 	/*
 	 * Clamp the desired resource range to the maximum address
 	 * this window supports.  Reject impossible requests.
 	 *
 	 * For I/O port requests behind a bridge with the ISA enable
 	 * bit set, force large allocations to start above 64k.
 	 */
 	if (!w->valid)
 		return (EINVAL);
 	if (sc->bridgectl & PCIB_BCR_ISA_ENABLE && count > 0x100 &&
 	    start < 65536)
 		start = 65536;
 	if (end > w->rman.rm_end)
 		end = w->rman.rm_end;
 	/* The second test rejects requests where start + count wraps. */
 	if (start + count - 1 > end || start + count < start)
 		return (EINVAL);
 	wmask = ((rman_res_t)1 << w->step) - 1;
 
 	/*
 	 * If there is no resource at all, just try to allocate enough
 	 * aligned space for this resource.
 	 */
 	if (w->res == NULL) {
 		error = pcib_alloc_new_window(sc, w, type, start, end, count,
 		    flags);
 		if (error) {
 			if (bootverbose)
 				device_printf(sc->dev,
 		    "failed to allocate initial %s window (%#jx-%#jx,%#jx)\n",
 				    w->name, start, end, count);
 			return (error);
 		}
 		if (bootverbose)
 			device_printf(sc->dev,
 			    "allocated initial %s window of %#jx-%#jx\n",
 			    w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
 		goto updatewin;
 	}
 
 	/*
 	 * See if growing the window would help.  Compute the minimum
 	 * amount of address space needed on both the front and back
 	 * ends of the existing window to satisfy the allocation.
 	 *
 	 * For each end, build a candidate region adjusting for the
 	 * required alignment, etc.  If there is a free region at the
 	 * edge of the window, grow from the inner edge of the free
 	 * region.  Otherwise grow from the window boundary.
 	 *
 	 * Growing an I/O window below 64k for a bridge with the ISA
 	 * enable bit doesn't require any special magic as the step
 	 * size of an I/O window (1k) always includes multiple
 	 * non-alias ranges when it is grown in either direction.
 	 *
 	 * XXX: Special case: if w->res is completely empty and the
 	 * request size is larger than w->res, we should find the
 	 * optimal aligned buffer containing w->res and allocate that.
 	 */
 	if (bootverbose)
 		device_printf(sc->dev,
 		    "attempting to grow %s window for (%#jx-%#jx,%#jx)\n",
 		    w->name, start, end, count);
 	align = (rman_res_t)1 << RF_ALIGNMENT(flags);
 	/*
 	 * Front candidate: 'front' ends up as the number of bytes by which
 	 * the window base would have to be lowered, or 0 if growing the
 	 * front cannot help.
 	 */
 	if (start < w->base) {
 		if (rman_first_free_region(&w->rman, &start_free, &end_free) !=
 		    0 || start_free != w->base)
 			end_free = w->base;
 		if (end_free > end)
 			end_free = end + 1;
 
 		/* Move end_free down until it is properly aligned. */
 		end_free &= ~(align - 1);
 		end_free--;
 		front = end_free - (count - 1);
 
 		/*
 		 * The resource would now be allocated at (front,
 		 * end_free).  Ensure that fits in the (start, end)
 		 * bounds.  end_free is checked above.  If 'front' is
 		 * ok, ensure it is properly aligned for this window.
 		 * Also check for underflow.
 		 */
 		if (front >= start && front <= end_free) {
 			if (bootverbose)
 				printf("\tfront candidate range: %#jx-%#jx\n",
 				    front, end_free);
 			front &= ~wmask;
 			front = w->base - front;
 		} else
 			front = 0;
 	} else
 		front = 0;
 	/*
 	 * Back candidate: 'back' ends up as the number of bytes by which
 	 * the window limit would have to be raised, or 0 if growing the
 	 * back cannot help.
 	 */
 	if (end > w->limit) {
 		if (rman_last_free_region(&w->rman, &start_free, &end_free) !=
 		    0 || end_free != w->limit)
 			start_free = w->limit + 1;
 		if (start_free < start)
 			start_free = start;
 
 		/* Move start_free up until it is properly aligned. */
 		start_free = roundup2(start_free, align);
 		back = start_free + count - 1;
 
 		/*
 		 * The resource would now be allocated at (start_free,
 		 * back).  Ensure that fits in the (start, end)
 		 * bounds.  start_free is checked above.  If 'back' is
 		 * ok, ensure it is properly aligned for this window.
 		 * Also check for overflow.
 		 */
 		if (back <= end && start_free <= back) {
 			if (bootverbose)
 				printf("\tback candidate range: %#jx-%#jx\n",
 				    start_free, back);
 			back |= wmask;
 			back -= w->limit;
 		} else
 			back = 0;
 	} else
 		back = 0;
 
 	/*
 	 * Try to allocate the smallest needed region first.
 	 * If that fails, fall back to the other region.
 	 *
 	 * A candidate that fails is zeroed, so the loop runs at most
 	 * twice; 'error' stays ENOSPC if there was no candidate at all.
 	 */
 	error = ENOSPC;
 	while (front != 0 || back != 0) {
 		if (front != 0 && (front <= back || back == 0)) {
 			error = pcib_expand_window(sc, w, type, w->base - front,
 			    w->limit);
 			if (error == 0)
 				break;
 			front = 0;
 		} else {
 			error = pcib_expand_window(sc, w, type, w->base,
 			    w->limit + back);
 			if (error == 0)
 				break;
 			back = 0;
 		}
 	}
 
 	if (error)
 		return (error);
 	if (bootverbose)
 		device_printf(sc->dev, "grew %s window to %#jx-%#jx\n",
 		    w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
 
 updatewin:
 	/* Write the new window. */
 	KASSERT((w->base & wmask) == 0, ("start address is not aligned"));
 	KASSERT((w->limit & wmask) == wmask, ("end address is not aligned"));
 	pcib_write_windows(sc, w->mask);
 	return (0);
 }
 
 /*
  * We have to trap resource allocation requests and ensure that the bridge
  * is set up to, or capable of handling them.
  *
  * Returns the allocated resource or NULL.  I/O port and memory requests
  * are first sub-allocated from the matching window; if that fails on a
  * bridge that is not subtractive, the window is grown and the
  * sub-allocation retried.  Subtractive bridges instead pass a failed
  * request up the tree.  Other resource types go straight to
  * bus_generic_alloc_resource() (or pcib_alloc_subbus() for PCI_RES_BUS).
  */
 struct resource *
 pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct pcib_softc *sc;
 	struct resource *r;
 
 	sc = device_get_softc(dev);
 
 	/*
 	 * VGA resources are decoded iff the VGA enable bit is set in
 	 * the bridge control register.  VGA resources do not fall into
 	 * the resource windows and are passed up to the parent.
 	 */
 	if ((type == SYS_RES_IOPORT && pci_is_vga_ioport_range(start, end)) ||
 	    (type == SYS_RES_MEMORY && pci_is_vga_memory_range(start, end))) {
 		if (sc->bridgectl & PCIB_BCR_VGA_ENABLE)
 			return (bus_generic_alloc_resource(dev, child, type,
 			    rid, start, end, count, flags));
 		else
 			return (NULL);
 	}
 
 	switch (type) {
 #ifdef PCI_RES_BUS
 	case PCI_RES_BUS:
 		return (pcib_alloc_subbus(&sc->bus, child, rid, start, end,
 		    count, flags));
 #endif
 	case SYS_RES_IOPORT:
 		/* Requests flagged by pcib_is_isa_range() are refused. */
 		if (pcib_is_isa_range(sc, start, end, count))
 			return (NULL);
 		r = pcib_suballoc_resource(sc, &sc->io, child, type, rid, start,
 		    end, count, flags);
 		/* Subtractive bridges never grow the window. */
 		if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
 			break;
 		if (pcib_grow_window(sc, &sc->io, type, start, end, count,
 		    flags) == 0)
 			r = pcib_suballoc_resource(sc, &sc->io, child, type,
 			    rid, start, end, count, flags);
 		break;
 	case SYS_RES_MEMORY:
 		/*
 		 * For prefetchable resources, prefer the prefetchable
 		 * memory window, but fall back to the regular memory
 		 * window if that fails.  Try both windows before
 		 * attempting to grow a window in case the firmware
 		 * has used a range in the regular memory window to
 		 * map a prefetchable BAR.
 		 */
 		if (flags & RF_PREFETCHABLE) {
 			r = pcib_suballoc_resource(sc, &sc->pmem, child, type,
 			    rid, start, end, count, flags);
 			if (r != NULL)
 				break;
 		}
 		r = pcib_suballoc_resource(sc, &sc->mem, child, type, rid,
 		    start, end, count, flags);
 		/* Subtractive bridges never grow the window. */
 		if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0)
 			break;
 		if (flags & RF_PREFETCHABLE) {
 			if (pcib_grow_window(sc, &sc->pmem, type, start, end,
 			    count, flags) == 0) {
 				r = pcib_suballoc_resource(sc, &sc->pmem, child,
 				    type, rid, start, end, count, flags);
 				if (r != NULL)
 					break;
 			}
 		}
 		/*
 		 * The regular memory window is grown with RF_PREFETCHABLE
 		 * masked off; the retry still passes the caller's flags.
 		 */
 		if (pcib_grow_window(sc, &sc->mem, type, start, end, count,
 		    flags & ~RF_PREFETCHABLE) == 0)
 			r = pcib_suballoc_resource(sc, &sc->mem, child, type,
 			    rid, start, end, count, flags);
 		break;
 	default:
 		return (bus_generic_alloc_resource(dev, child, type, rid,
 		    start, end, count, flags));
 	}
 
 	/*
 	 * If attempts to suballocate from the window fail but this is a
 	 * subtractive bridge, pass the request up the tree.
 	 */
 	if (sc->flags & PCIB_SUBTRACTIVE && r == NULL)
 		return (bus_generic_alloc_resource(dev, child, type, rid,
 		    start, end, count, flags));
 	return (r);
 }
 
 /*
  * Adjust a resource's range.  Resources that pcib_is_resource_managed()
  * attributes to this bridge are adjusted with rman_adjust_resource();
  * all others are handed to bus_generic_adjust_resource().
  */
 int
 pcib_adjust_resource(device_t bus, device_t child, int type, struct resource *r,
     rman_res_t start, rman_res_t end)
 {
 	struct pcib_softc *softc = device_get_softc(bus);
 
 	if (!pcib_is_resource_managed(softc, type, r))
 		return (bus_generic_adjust_resource(bus, child, type, r,
 		    start, end));
 	return (rman_adjust_resource(r, start, end));
 }
 
 /*
  * Release a resource.  Resources that pcib_is_resource_managed()
  * attributes to this bridge are deactivated first when RF_ACTIVE is set
  * (a deactivation error aborts the release) and then released with
  * rman_release_resource(); all others go to
  * bus_generic_release_resource().
  */
 int
 pcib_release_resource(device_t dev, device_t child, int type, int rid,
     struct resource *r)
 {
 	struct pcib_softc *softc;
 	int error;
 
 	softc = device_get_softc(dev);
 	if (!pcib_is_resource_managed(softc, type, r))
 		return (bus_generic_release_resource(dev, child, type, rid,
 		    r));
 
 	if ((rman_get_flags(r) & RF_ACTIVE) != 0) {
 		error = bus_deactivate_resource(child, type, rid, r);
 		if (error != 0)
 			return (error);
 	}
 	return (rman_release_resource(r));
 }
 #else
 /*
  * We have to trap resource allocation requests and ensure that the bridge
  * is set up to, or capable of handling them.
  *
  * For I/O port and memory requests the range is checked against the
  * ranges recorded in the softc (iobase/iolimit, membase/memlimit,
  * pmembase/pmemlimit).  On a bridge that is not subtractive, a request
  * that does not fit is clipped to the matching range; subtractive bridges
  * accept any range.  Accepted requests, and all other resource types, are
  * forwarded with bus_generic_alloc_resource(); rejected requests return
  * NULL.
  */
 struct resource *
 pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 {
 	struct pcib_softc	*sc = device_get_softc(dev);
 	const char *name, *suffix;
 	int ok;
 
 	/*
 	 * Fail the allocation for this range if it's not supported.
 	 */
 	/* 'name' and 'suffix' only feed the diagnostic messages below. */
 	name = device_get_nameunit(child);
 	if (name == NULL) {
 		name = "";
 		suffix = "";
 	} else
 		suffix = " ";
 	switch (type) {
 	case SYS_RES_IOPORT:
 		ok = 0;
 		if (!pcib_is_io_open(sc))
 			break;
 		ok = (start >= sc->iobase && end <= sc->iolimit);
 
 		/*
 		 * Make sure we allow access to VGA I/O addresses when the
 		 * bridge has the "VGA Enable" bit set.
 		 */
 		if (!ok && pci_is_vga_ioport_range(start, end))
 			ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
 
 		if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
 			if (!ok) {
 				/* Clip the request to the I/O range. */
 				if (start < sc->iobase)
 					start = sc->iobase;
 				if (end > sc->iolimit)
 					end = sc->iolimit;
 				if (start < end)
 					ok = 1;
 			}
 		} else {
 			ok = 1;
 #if 0
 			/*
 			 * If we overlap with the subtractive range, then
 			 * pick the upper range to use.
 			 */
 			if (start < sc->iolimit && end > sc->iobase)
 				start = sc->iolimit + 1;
 #endif
 		}
 		/* Clipping can leave an empty range; reject it. */
 		if (end < start) {
 			device_printf(dev, "ioport: end (%jx) < start (%jx)\n",
 			    end, start);
 			start = 0;
 			end = 0;
 			ok = 0;
 		}
 		if (!ok) {
 			device_printf(dev, "%s%srequested unsupported I/O "
 			    "range 0x%jx-0x%jx (decoding 0x%x-0x%x)\n",
 			    name, suffix, start, end, sc->iobase, sc->iolimit);
 			return (NULL);
 		}
 		if (bootverbose)
 			device_printf(dev,
 			    "%s%srequested I/O range 0x%jx-0x%jx: in range\n",
 			    name, suffix, start, end);
 		break;
 
 	case SYS_RES_MEMORY:
 		/* Accept a range that lies inside either open memory window. */
 		ok = 0;
 		if (pcib_is_nonprefetch_open(sc))
 			ok = ok || (start >= sc->membase && end <= sc->memlimit);
 		if (pcib_is_prefetch_open(sc))
 			ok = ok || (start >= sc->pmembase && end <= sc->pmemlimit);
 
 		/*
 		 * Make sure we allow access to VGA memory addresses when the
 		 * bridge has the "VGA Enable" bit set.
 		 */
 		if (!ok && pci_is_vga_memory_range(start, end))
 			ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;
 
 		if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
 			if (!ok) {
 				/*
 				 * Clip to the window selected by
 				 * RF_PREFETCHABLE; fail if that window
 				 * is not open.
 				 */
 				ok = 1;
 				if (flags & RF_PREFETCHABLE) {
 					if (pcib_is_prefetch_open(sc)) {
 						if (start < sc->pmembase)
 							start = sc->pmembase;
 						if (end > sc->pmemlimit)
 							end = sc->pmemlimit;
 					} else {
 						ok = 0;
 					}
 				} else {	/* non-prefetchable */
 					if (pcib_is_nonprefetch_open(sc)) {
 						if (start < sc->membase)
 							start = sc->membase;
 						if (end > sc->memlimit)
 							end = sc->memlimit;
 					} else {
 						ok = 0;
 					}
 				}
 			}
 		} else if (!ok) {
 			ok = 1;	/* subtractive bridge: always ok */
 #if 0
 			if (pcib_is_nonprefetch_open(sc)) {
 				if (start < sc->memlimit && end > sc->membase)
 					start = sc->memlimit + 1;
 			}
 			if (pcib_is_prefetch_open(sc)) {
 				if (start < sc->pmemlimit && end > sc->pmembase)
 					start = sc->pmemlimit + 1;
 			}
 #endif
 		}
 		/* Clipping can leave an empty range; reject it. */
 		if (end < start) {
 			device_printf(dev, "memory: end (%jx) < start (%jx)\n",
 			    end, start);
 			start = 0;
 			end = 0;
 			ok = 0;
 		}
 		/* Unlike the I/O case, the failure is only logged when verbose. */
 		if (!ok && bootverbose)
 			device_printf(dev,
 			    "%s%srequested unsupported memory range %#jx-%#jx "
 			    "(decoding %#jx-%#jx, %#jx-%#jx)\n",
 			    name, suffix, start, end,
 			    (uintmax_t)sc->membase, (uintmax_t)sc->memlimit,
 			    (uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
 		if (!ok)
 			return (NULL);
 		if (bootverbose)
 			device_printf(dev,"%s%srequested memory range "
 			    "0x%jx-0x%jx: good\n",
 			    name, suffix, start, end);
 		break;
 
 	default:
 		break;
 	}
 	/*
 	 * Bridge is OK decoding this resource, so pass it up.
 	 */
 	return (bus_generic_alloc_resource(dev, child, type, rid, start, end,
 	    count, flags));
 }
 #endif
 
 /*
  * When ARI is enabled on this downstream port, split the incoming function
  * number into the non-ARI slot/function pair; the downstream port converts
  * it back in hardware.  Without ARI, *slot and *func are left untouched.
  */
 static __inline void
 pcib_xlate_ari(device_t pcib, int bus, int *slot, int *func)
 {
 	struct pcib_softc *softc = device_get_softc(pcib);
 	int wide_func;
 
 	if ((softc->flags & PCIB_ENABLE_ARI) == 0)
 		return;
 
 	KASSERT(*slot == 0, ("Non-zero slot number with ARI enabled!"));
 	wide_func = *func;
 	*slot = PCIE_ARI_SLOT(wide_func);
 	*func = PCIE_ARI_FUNC(wide_func);
 }
 
 
 /*
  * Set PCIEM_CTL2_ARI in the bridge's PCIER_DEVICE_CTL2 register (located
  * relative to 'pcie_pos') and record PCIB_ENABLE_ARI in the softc flags.
  */
 static void
 pcib_enable_ari(struct pcib_softc *sc, uint32_t pcie_pos)
 {
 	uint32_t devctl2;
 
 	devctl2 = pci_read_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, 4);
 	devctl2 |= PCIEM_CTL2_ARI;
 	pci_write_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, devctl2, 4);
 	sc->flags |= PCIB_ENABLE_ARI;
 }
 
 /*
  * PCIB interface.
  */
 /* The highest slot number is always PCI_SLOTMAX, whatever the bridge. */
 int
 pcib_maxslots(device_t dev)
 {
 
 	return (PCI_SLOTMAX);
 }
 
 /* Highest slot number: PCIE_ARI_SLOTMAX with ARI enabled, else PCI_SLOTMAX. */
 static int
 pcib_ari_maxslots(device_t dev)
 {
 	struct pcib_softc *softc = device_get_softc(dev);
 
 	if ((softc->flags & PCIB_ENABLE_ARI) == 0)
 		return (PCI_SLOTMAX);
 	return (PCIE_ARI_SLOTMAX);
 }
 
 /* Highest function number: PCIE_ARI_FUNCMAX with ARI enabled, else PCI_FUNCMAX. */
 static int
 pcib_ari_maxfuncs(device_t dev)
 {
 	struct pcib_softc *softc = device_get_softc(dev);
 
 	if ((softc->flags & PCIB_ENABLE_ARI) == 0)
 		return (PCI_FUNCMAX);
 	return (PCIE_ARI_FUNCMAX);
 }
 
 /*
  * Split a requester ID into bus, slot and function.  The bus is always
  * taken with PCI_RID2BUS(); slot and function use the PCIE_ARI_* decoders
  * when ARI is enabled on this bridge and the plain PCI_* decoders
  * otherwise.
  */
 static void
 pcib_ari_decode_rid(device_t pcib, uint16_t rid, int *bus, int *slot,
     int *func)
 {
 	struct pcib_softc *softc = device_get_softc(pcib);
 
 	*bus = PCI_RID2BUS(rid);
 	if ((softc->flags & PCIB_ENABLE_ARI) == 0) {
 		*slot = PCI_RID2SLOT(rid);
 		*func = PCI_RID2FUNC(rid);
 		return;
 	}
 	*slot = PCIE_ARI_RID2SLOT(rid);
 	*func = PCIE_ARI_RID2FUNC(rid);
 }
 
 /*
  * Since we are a child of a PCI bus, its parent must support the pcib interface.
  *
  * Read a configuration register of bus/slot/function b/s/f by forwarding
  * the request to the grandparent of 'dev' after ARI translation.  With
  * PCI_HP, a bridge that pcib_present() reports as absent answers with
  * all ones for the requested width without touching the hardware.
  */
 static uint32_t
 pcib_read_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, int width)
 {
 	int slot, func;
 #ifdef PCI_HP
 	struct pcib_softc *sc;
 
 	sc = device_get_softc(dev);
 	if (!pcib_present(sc)) {
 		switch (width) {
 		case 2:
 			return (0xffff);
 		case 1:
 			return (0xff);
 		default:
 			return (0xffffffff);
 		}
 	}
 #endif
 	/*
 	 * pcib_xlate_ari() works on int, so translate through int copies
 	 * rather than passing pointers to the u_int parameters.
 	 */
 	slot = s;
 	func = f;
 	pcib_xlate_ari(dev, b, &slot, &func);
 	return (PCIB_READ_CONFIG(device_get_parent(device_get_parent(dev)), b,
 	    slot, func, reg, width));
 }
 
 /*
  * Write a configuration register of bus/slot/function b/s/f by forwarding
  * the request to the grandparent of 'dev' after ARI translation.  With
  * PCI_HP, writes are silently dropped while pcib_present() reports the
  * bridge as absent.
  */
 static void
 pcib_write_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, uint32_t val, int width)
 {
 	int slot, func;
 #ifdef PCI_HP
 	struct pcib_softc *sc;
 
 	sc = device_get_softc(dev);
 	if (!pcib_present(sc))
 		return;
 #endif
 	/*
 	 * pcib_xlate_ari() works on int, so translate through int copies
 	 * rather than passing pointers to the u_int parameters.
 	 */
 	slot = s;
 	func = f;
 	pcib_xlate_ari(dev, b, &slot, &func);
 	PCIB_WRITE_CONFIG(device_get_parent(device_get_parent(dev)), b, slot,
 	    func, reg, val, width);
 }
 
 /*
  * Route an interrupt across a PCI bridge.
  *
  * Returns whatever the parent bridge's PCIB_ROUTE_INTERRUPT() returns for
  * the swizzled pin.
  */
 int
 pcib_route_interrupt(device_t pcib, device_t dev, int pin)
 {
 	device_t parent_bus, parent_bridge;
 	int irq, swizzled_pin;
 
 	/*
 	 * The PCI standard defines a swizzle of the child-side
 	 * device/intpin to the parent-side intpin as follows.
 	 *
 	 * device = device on child bus
 	 * child_intpin = intpin on child bus slot (0-3)
 	 * parent_intpin = intpin on parent bus slot (0-3)
 	 *
 	 * parent_intpin = (device + child_intpin) % 4
 	 */
 	swizzled_pin = (pci_get_slot(dev) + (pin - 1)) % 4;
 
 	/*
 	 * Our parent is a PCI bus.  Its parent must export the pcib
 	 * interface which includes the ability to route interrupts.
 	 */
 	parent_bus = device_get_parent(pcib);
 	parent_bridge = device_get_parent(parent_bus);
 	irq = PCIB_ROUTE_INTERRUPT(parent_bridge, pcib, swizzled_pin + 1);
 	if (bootverbose && PCI_INTERRUPT_VALID(irq))
 		device_printf(pcib, "slot %d INT%c is routed to irq %d\n",
 		    pci_get_slot(dev), 'A' + pin - 1, irq);
 	return (irq);
 }
 
 /*
  * Forward an MSI allocation request to the parent bridge, unless this
  * bridge is flagged PCIB_DISABLE_MSI, in which case ENXIO is returned.
  */
 int
 pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
 {
 	struct pcib_softc *softc;
 	device_t parent_bridge;
 
 	softc = device_get_softc(pcib);
 	if ((softc->flags & PCIB_DISABLE_MSI) != 0)
 		return (ENXIO);
 	parent_bridge = device_get_parent(device_get_parent(pcib));
 	return (PCIB_ALLOC_MSI(parent_bridge, dev, count, maxcount, irqs));
 }
 
 /* Forward an MSI release request to the parent bridge. */
 int
 pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
 {
 	device_t parent_bridge;
 
 	parent_bridge = device_get_parent(device_get_parent(pcib));
 	return (PCIB_RELEASE_MSI(parent_bridge, dev, count, irqs));
 }
 
 /*
  * Forward an MSI-X allocation request to the parent bridge, unless this
  * bridge is flagged PCIB_DISABLE_MSIX, in which case ENXIO is returned.
  */
 int
 pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
 {
 	struct pcib_softc *softc;
 	device_t parent_bridge;
 
 	softc = device_get_softc(pcib);
 	if ((softc->flags & PCIB_DISABLE_MSIX) != 0)
 		return (ENXIO);
 	parent_bridge = device_get_parent(device_get_parent(pcib));
 	return (PCIB_ALLOC_MSIX(parent_bridge, dev, irq));
 }
 
 /* Forward an MSI-X release request to the parent bridge. */
 int
 pcib_release_msix(device_t pcib, device_t dev, int irq)
 {
 	device_t parent_bridge;
 
 	parent_bridge = device_get_parent(device_get_parent(pcib));
 	return (PCIB_RELEASE_MSIX(parent_bridge, dev, irq));
 }
 
 /*
  * Forward an MSI/MSI-X mapping request to the parent bridge.  On success
  * the resulting address is also handed to pci_ht_map_msi() for this
  * bridge; on failure the parent's error is returned unchanged.
  */
 int
 pcib_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr,
     uint32_t *data)
 {
 	device_t parent_bridge;
 	int error;
 
 	parent_bridge = device_get_parent(device_get_parent(pcib));
 	error = PCIB_MAP_MSI(parent_bridge, dev, irq, addr, data);
 	if (error == 0)
 		pci_ht_map_msi(pcib, *addr);
 	return (error);
 }
 
 /* Forward a request for the device's sleep power state to this bridge's parent. */
 int
 pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate)
 {
 	device_t parent = device_get_parent(pcib);
 
 	return (PCIB_POWER_FOR_SLEEP(parent, dev, pstate));
 }
 
 /* Returns 1 if PCIB_ENABLE_ARI is set in the bridge's flags, otherwise 0. */
 static int
 pcib_ari_enabled(device_t pcib)
 {
 	struct pcib_softc *softc = device_get_softc(pcib);
 
 	return ((softc->flags & PCIB_ENABLE_ARI) ? 1 : 0);
 }
 
 /*
  * Look up an ID for 'dev'.  Anything other than PCI_ID_RID is forwarded
  * to the parent bridge.  A requester ID is built here: from bus and
  * function when ARI is enabled on this bridge, and from bus, slot and
  * function otherwise.  Returns 0 for PCI_ID_RID, else the parent's result.
  */
 static int
 pcib_ari_get_id(device_t pcib, device_t dev, enum pci_id_type type,
     uintptr_t *id)
 {
 	struct pcib_softc *softc;
 	device_t parent_bus;
 	uint8_t busno, slotno, funcno;
 
 	if (type != PCI_ID_RID) {
 		parent_bus = device_get_parent(pcib);
 		return (PCIB_GET_ID(device_get_parent(parent_bus), dev, type,
 		    id));
 	}
 
 	softc = device_get_softc(pcib);
 	busno = pci_get_bus(dev);
 	funcno = pci_get_function(dev);
 	if ((softc->flags & PCIB_ENABLE_ARI) != 0) {
 		*id = (PCI_ARI_RID(busno, funcno));
 	} else {
 		slotno = pci_get_slot(dev);
 		*id = (PCI_RID(busno, slotno, funcno));
 	}
 
 	return (0);
 }
 
 /*
  * Enable ARI if both the downstream port (pcib) and the endpoint device
  * (dev) support it.  Returns 0 after enabling it, ENODEV when either side
  * lacks the required capability, and ENXIO when the endpoint reports an
  * ARI version other than PCIB_SUPPORTED_ARI_VER.
  */
 static int
 pcib_try_enable_ari(device_t pcib, device_t dev)
 {
 	struct pcib_softc *softc;
 	uint32_t pcie_pos, port_cap2, ari_hdr;
 	int ari_off;
 
 	softc = device_get_softc(pcib);
 
 	/*
 	 * ARI is controlled through a register in the PCIe capability
 	 * structure, so a downstream port without that structure cannot
 	 * support ARI.
 	 */
 	if (pci_find_cap(pcib, PCIY_EXPRESS, &pcie_pos) != 0)
 		return (ENODEV);
 
 	/* The port itself must advertise ARI support. */
 	port_cap2 = pci_read_config(pcib, pcie_pos + PCIER_DEVICE_CAP2, 4);
 	if ((port_cap2 & PCIEM_CAP2_ARI) == 0)
 		return (ENODEV);
 
 	/*
 	 * The endpoint advertises ARI support through the ARI extended
 	 * capability structure.
 	 */
 	if (pci_find_extcap(dev, PCIZ_ARI, &ari_off) != 0)
 		return (ENODEV);
 
 	/* Lastly, the endpoint must implement the ARI version we support. */
 	ari_hdr = pci_read_config(dev, ari_off, 4);
 	if (PCI_EXTCAP_VER(ari_hdr) != PCIB_SUPPORTED_ARI_VER) {
 		if (bootverbose)
 			device_printf(pcib,
 			    "Unsupported version of ARI (%d) detected\n",
 			    PCI_EXTCAP_VER(ari_hdr));
 
 		return (ENXIO);
 	}
 
 	pcib_enable_ari(softc, pcie_pos);
 
 	return (0);
 }
Index: user/alc/PQ_LAUNDRY/sys/kern/sched_4bsd.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/sched_4bsd.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/kern/sched_4bsd.c	(revision 303517)
@@ -1,1797 +1,1797 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/umtx.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 int				dtrace_vtime_active;
 dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;
 #endif
 
 /*
  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  * the range 100-256 Hz (approximately).
  */
 /* Clamp an estcpu value (e) to the largest value this scheduler keeps. */
 #define	ESTCPULIM(e) \
     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
 #ifdef SMP
 /* On SMP kernels the weight is scaled by smp_cpus. */
 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
 #else
 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
 #endif
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 /* Size of the KTR-only ts_name buffer in struct td_sched. */
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
 
 /*
  * The schedulable entity that runs a context.
  * This is an extension to the thread structure and is tailored to
  * the requirements of this scheduler.
  * All fields are protected by the scheduler lock.
  */
 struct td_sched {
 	fixpt_t		ts_pctcpu;	/* %cpu during p_swtime. */
 	u_int		ts_estcpu;	/* Estimated cpu utilization. */
 	int		ts_cpticks;	/* Ticks of cpu time. */
 	int		ts_slptime;	/* Seconds !RUNNING. */
 	int		ts_slice;	/* Remaining part of time slice. */
 	int		ts_flags;	/* TSF_* flags. */
 	struct runq	*ts_runq;	/* runq the thread is currently on */
 #ifdef KTR
 	char		ts_name[TS_NAME_LEN];	/* Name buffer, KTR kernels only. */
 #endif
 };
 
 /* flags kept in td_flags */
 #define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
 #define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /* flags kept in ts_flags */
 #define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */
 
 /* True when ts is assigned to a run queue other than the global runq. */
 #define SKE_RUNQ_PCPU(ts)						\
     ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
 
 /* True when cpu is a member of td's cpuset mask. */
 #define	THREAD_CAN_SCHED(td, cpu)	\
     CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
 
 /*
  * struct thread followed by struct td_sched must fit in
  * struct thread0_storage.
  */
 _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
     sizeof(struct thread0_storage),
     "increase struct thread0_storage.t0st_sched size");
 
 /* Spin mutex used as the thread lock for thread0 and forked threads. */
 static struct mtx sched_lock;
 
 static int	realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
 static int	sched_tdcnt;	/* Total runnable threads in the system. */
 static int	sched_slice = 12; /* Thread run time before rescheduling. */
 
 static void	setup_runqs(void);
 static void	schedcpu(void);
 static void	schedcpu_thread(void);
 static void	sched_priority(struct thread *td, u_char prio);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
 static void	updatepri(struct thread *td);
 static void	resetpriority(struct thread *td);
 static void	resetpriority_thread(struct thread *td);
 #ifdef SMP
 static int	sched_pickcpu(struct thread *td);
 static int	forward_wakeup(int cpunum);
 static void	kick_other_cpu(int pri, int cpuid);
 #endif
 
 /* Descriptor handed to kproc_start() for the "schedcpu" kernel process. */
 static struct kproc_desc sched_kp = {
         "schedcpu",
         schedcpu_thread,
         NULL
 };
 SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
     &sched_kp);
 /* Run queue setup happens at SI_SUB_RUN_QUEUE. */
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
 /* Time constants are computed at SI_SUB_CLOCKS. */
 static void sched_initticks(void *dummy);
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
 /*
  * Global run queue.
  */
 static struct runq runq;
 
 #ifdef SMP
 /*
  * Per-CPU run queues
  */
 static struct runq runq_pcpu[MAXCPU];
 /* NOTE(review): presumably the length of each per-CPU queue -- confirm. */
 long runq_length[MAXCPU];
 
 /* Set of idle CPUs; sched_switch() sets and clears bits for idle threads. */
 static cpuset_t idle_cpus_mask;
 #endif
 
 /*
  * Per-CPU idle call counters.  sched_clock() copies idlecalls into
  * oldidlecalls and then resets idlecalls to zero.
  */
 struct pcpuidlestat {
 	u_int idlecalls;
 	u_int oldidlecalls;
 };
 static DPCPU_DEFINE(struct pcpuidlestat, idlestat);
 
 /*
  * Initialize the per-CPU run queues (SMP kernels only) and then the
  * global run queue.
  */
 static void
 setup_runqs(void)
 {
 #ifdef SMP
 	int cpu;
 
 	cpu = 0;
 	while (cpu < MAXCPU) {
 		runq_init(&runq_pcpu[cpu]);
 		cpu++;
 	}
 #endif
 
 	runq_init(&runq);
 }
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val, period;
 
 	period = 1000000 / realstathz;
 	new_val = period * sched_slice;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val <= 0)
 		return (EINVAL);
 	sched_slice = imax(1, (new_val + period / 2) / period);
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 	return (0);
 }
 
 /* Root of the kern.sched sysctl tree. */
 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
 
 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
     "Scheduler name");
 /* kern.sched.quantum goes through sysctl_kern_quantum(). */
 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, sysctl_kern_quantum, "I",
     "Quantum for timeshare threads in microseconds");
 /* kern.sched.slice exposes sched_slice directly. */
 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
     "Quantum for timeshare threads in stathz ticks");
 #ifdef SMP
 /* Enable forwarding of wakeups to all other cpus */
 static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
     "Kernel SMP");
 
 static int runq_fuzz = 1;
 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
 
 /* Master switch checked at the top of forward_wakeup(). */
 static int forward_wakeup_enabled = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
 	   &forward_wakeup_enabled, 0,
 	   "Forwarding of wakeup to idle CPUs");
 
 /* Read-only counter incremented by forward_wakeup(). */
 static int forward_wakeups_requested = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
 	   &forward_wakeups_requested, 0,
 	   "Requests for Forwarding of wakeup to idle CPUs");
 
 /* Read-only counter incremented by forward_wakeup(). */
 static int forward_wakeups_delivered = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
 	   &forward_wakeups_delivered, 0,
 	   "Completed Forwarding of wakeup to idle CPUs");
 
 /* When set, forward_wakeup() starts from idle_cpus_mask. */
 static int forward_wakeup_use_mask = 1;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
 	   &forward_wakeup_use_mask, 0,
 	   "Use the mask of idle cpus");
 
 /* When set, forward_wakeup() walks the CPU list looking for idle threads. */
 static int forward_wakeup_use_loop = 0;
 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
 	   &forward_wakeup_use_loop, 0,
 	   "Use a loop to find idle cpus");
 
 #endif
 /* Disabled: the followon knob is compiled out. */
 #if 0
 static int sched_followon = 0;
 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
 	   &sched_followon, 0,
 	   "allow threads to share a quantum");
 #endif
 
 /*
  * Statically defined tracing probes of the "sched" provider.  The string
  * arguments name the types of each probe's arguments.
  */
 SDT_PROVIDER_DEFINE(sched);
 
 SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
     "struct proc *", "uint8_t");
 SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
     "struct proc *", "void *");
 SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
 SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
 SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
 SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
     "struct proc *");
 SDT_PROBE_DEFINE(sched, , , on__cpu);
 SDT_PROBE_DEFINE(sched, , , remain__cpu);
 SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
     "struct proc *");
 
 /*
  * Account for one more runnable thread and publish the new global load
  * to KTR and to the load-change probe.
  */
 static __inline void
 sched_load_add(void)
 {
 
 	sched_tdcnt += 1;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 
 /*
  * Account for one fewer runnable thread and publish the new global load
  * to KTR and to the load-change probe.
  */
 static __inline void
 sched_load_rem(void)
 {
 
 	sched_tdcnt -= 1;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 /*
  * Flag the current thread for rescheduling when td has a better
  * (numerically lower) priority than it.  The caller must hold td's
  * thread lock.
  */
 static void
 maybe_resched(struct thread *td)
 {
 	struct thread *cur;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	cur = curthread;
 	if (cur->td_priority > td->td_priority)
 		cur->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * This function is called when a thread is about to be put on run queue
  * because it has been made runnable or its priority has been adjusted.  It
  * determines if the new thread should be immediately preempted to.  If so,
  * it switches to it and eventually returns 1.  If not, it returns 0
  * so that the caller may place the thread on an appropriate run queue.
  * Kernels built without PREEMPTION always return 0.
  */
 int
 maybe_preempt(struct thread *td)
 {
 #ifdef PREEMPTION
 	struct thread *ctd;
 	int cpri, pri;
 
 	/*
 	 * The new thread should not preempt the current thread if any of the
 	 * following conditions are true:
 	 *
 	 *  - The kernel is in the throes of crashing (panicstr).
 	 *  - The current thread has a higher (numerically lower) or
 	 *    equivalent priority.  Note that this prevents curthread from
 	 *    trying to preempt to itself.
 	 *  - It is too early in the boot for context switches (cold is set).
 	 *  - The current thread has an inhibitor set or is in the process of
 	 *    exiting.  In this case, the current thread is about to switch
 	 *    out anyways, so there's no point in preempting.  If we did,
 	 *    the current thread would not be properly resumed as well, so
 	 *    just avoid that whole landmine.
 	 *  - If the new thread's priority is not a realtime priority and
 	 *    the current thread's priority is not an idle priority and
 	 *    FULL_PREEMPTION is disabled.
 	 *
 	 * If all of these conditions are false, but the current thread is in
 	 * a nested critical section, then we have to defer the preemption
 	 * until we exit the critical section.  Otherwise, switch immediately
 	 * to the new thread.
 	 */
 	ctd = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 			("maybe_preempt: trying to run inhibited thread"));
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
 	    TD_IS_INHIBITED(ctd))
 		return (0);
 #ifndef FULL_PREEMPTION
 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
 		return (0);
 #endif
 
 	/* Nested critical section: record the owed preemption and bail. */
 	if (ctd->td_critnest > 1) {
 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
 		    ctd->td_critnest);
 		ctd->td_owepreempt = 1;
 		return (0);
 	}
 	/*
 	 * Thread is runnable but not yet put on system run queue.
 	 */
 	MPASS(ctd->td_lock == td->td_lock);
 	MPASS(TD_ON_RUNQ(td));
 	TD_SET_RUNNING(td);
 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
 	    td->td_proc->p_pid, td->td_name);
 	/* Involuntary switch directly to td. */
 	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
 	/*
 	 * td's lock pointer may have changed.  We have to return with it
 	 * locked.
 	 */
 	spinlock_enter();
 	thread_unlock(ctd);
 	thread_lock(td);
 	spinlock_exit();
 	return (1);
 #else
 	return (0);
 #endif
 }
 
 /*
  * Constants for digital decay and forget:
  *	90% of (ts_estcpu) usage in 5 * loadav time
  *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
  * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously.
  *
  * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
  * 		ts_estcpu *= decay;
  * will compute
  * 	ts_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * The system computes decay as:
  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
  *
  * We wish to prove that the system's computation of decay
  * will always fulfill the equation:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * If we compute b as:
  * 	b = 2 * loadavg
  * then
  * 	decay = b / (b + 1)
  *
  * We now need to prove two things:
  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
  *
  * Facts:
  *         For x close to zero, exp(x) =~ 1 + x, since
  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
  *         For x close to zero, ln(1+x) =~ x, since
  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
  *         ln(.1) =~ -2.30
  *
  * Proof of (1):
  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
  *	solving for factor,
  *      ln(factor) =~ (-2.30/5*loadav), or
  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
  *
  * Proof of (2):
  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
  *	solving for power,
  *      power*ln(b/(b+1)) =~ -2.30, or
  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
  *
  * Actual power values for the implemented algorithm are as follows:
  *      loadav: 1       2       3       4
  *      power:  5.68    10.32   14.94   19.55
  */
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 /* loadfactor() is twice the (fixed-point) load average. */
 #define	loadfactor(loadav)	(2 * (loadav))
 /* decay_cpu() scales cpu by loadfac / (loadfac + FSCALE). */
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 /* Exported read-only as kern.ccpu. */
 SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 /*
  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
  *
  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
  *
  * If you don't want to bother with the faster/more-accurate formula, you
  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
  * (more general) method of calculating the %age of CPU used by a process.
  */
 #define	CCPU_SHIFT	11	/* Selects the ts_pctcpu formula in schedcpu(). */
 
 /*
  * Recompute process priorities, every hz ticks.
  * MP-safe, called without the Giant mutex.
  *
  * Walks every thread of every process (skipping processes still in
  * PRS_NEW), decays ts_pctcpu and ts_estcpu, maintains ts_slptime and
  * recomputes the priority of threads that have not been asleep for
  * more than one pass.
  */
 static void
 schedcpu(void)
 {
 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
 	struct td_sched *ts;
 	int awake;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			awake = 0;
 			ts = td_get_sched(td);
 			thread_lock(td);
 			/*
 			 * Increment sleep time (if sleeping).  We
 			 * ignore overflow.
 			 */
 			/*
 			 * The td_sched slptimes are not touched in wakeup
 			 * because the thread may not HAVE everything in
 			 * memory? XXX I think this is out of date.
 			 */
 			if (TD_ON_RUNQ(td)) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			} else if (TD_IS_RUNNING(td)) {
 				awake = 1;
 				/* Do not clear TDF_DIDRUN */
 			} else if (td->td_flags & TDF_DIDRUN) {
 				awake = 1;
 				td->td_flags &= ~TDF_DIDRUN;
 			}
 
 			/*
 			 * ts_pctcpu is only for ps and ttyinfo().
 			 */
 			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
 			/*
 			 * If the td_sched has been idle the entire second,
 			 * stop recalculating its priority until
 			 * it wakes up.
 			 */
 			if (ts->ts_cpticks != 0) {
 				/*
 				 * Fold the ticks used since the last pass
 				 * into ts_pctcpu, then start a new count.
 				 */
 #if	(FSHIFT >= CCPU_SHIFT)
 				ts->ts_pctcpu += (realstathz == 100)
 				    ? ((fixpt_t) ts->ts_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
 				    100 * (((fixpt_t) ts->ts_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 				ts->ts_pctcpu += ((FSCALE - ccpu) *
 				    (ts->ts_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
 				ts->ts_cpticks = 0;
 			}
 			/*
 			 * If there are ANY running threads in this process,
 			 * then don't count it as sleeping.
 			 * XXX: this is broken.
 			 */
 			if (awake) {
 				if (ts->ts_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
 					 * up from the long sleep should have
 					 * unwound the slptime and reset our
 					 * priority before we run at the stale
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
 					updatepri(td);
 				}
 				ts->ts_slptime = 0;
 			} else
 				ts->ts_slptime++;
 			/* Long sleepers are left alone until they wake. */
 			if (ts->ts_slptime > 1) {
 				thread_unlock(td);
 				continue;
 			}
 			ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
 		      	resetpriority(td);
 			resetpriority_thread(td);
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Body of the schedcpu kernel process: call schedcpu(), pause for hz
  * ticks, and repeat forever.
  */
 static void
 schedcpu_thread(void)
 {
 
 	while (1) {
 		schedcpu();
 		pause("-", hz);
 	}
 }
 
 /*
  * Recalculate the priority of a process after it has slept for a while.
  * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
  * least six times the loadfactor will decay ts_estcpu to zero.
  *
  * Shorter sleeps apply decay_cpu() once per remaining sleep period,
  * consuming ts_slptime as they go.
  */
 static void
 updatepri(struct thread *td)
 {
 	struct td_sched *ts;
 	fixpt_t loadfac;
 	unsigned int estcpu;
 
 	ts = td_get_sched(td);
 	loadfac = loadfactor(averunnable.ldavg[0]);
 
 	/* Slept long enough that nothing would be left anyway. */
 	if (ts->ts_slptime > 5 * loadfac) {
 		ts->ts_estcpu = 0;
 		return;
 	}
 
 	estcpu = ts->ts_estcpu;
 	ts->ts_slptime--;	/* was incremented in schedcpu() */
 	while (estcpu != 0) {
 		if (--ts->ts_slptime == 0)
 			break;
 		estcpu = decay_cpu(loadfac, estcpu);
 	}
 	ts->ts_estcpu = estcpu;
 }
 
 /*
  * Compute the user-mode priority of a timeshare thread from its
  * estimated cpu usage and its process's nice value, clamp it to the
  * timeshare range and record it with sched_user_prio().  Threads of
  * any other priority class are left untouched.
  */
 static void
 resetpriority(struct thread *td)
 {
 	u_int prio;
 
 	if (td->td_pri_class == PRI_TIMESHARE) {
 		prio = PUSER +
 		    td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
 		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
 		prio = max(prio, PRI_MIN_TIMESHARE);
 		prio = min(prio, PRI_MAX_TIMESHARE);
 		sched_user_prio(td, prio);
 	}
 }
 
 /*
  * Propagate a changed user priority to the thread's active priority.
  * Only threads currently running at a timeshare priority are affected.
  */
 static void
 resetpriority_thread(struct thread *td)
 {
 
 	if (td->td_priority >= PRI_MIN_TIMESHARE &&
 	    td->td_priority <= PRI_MAX_TIMESHARE) {
 		/* XXX the whole needresched thing is broken, but not silly. */
 		maybe_resched(td);
 
 		sched_prio(td, td->td_user_pri);
 	}
 }
 
 /*
  * SYSINIT hook: initialize the run queues and count thread0 in the
  * global load.  The argument is unused.
  */
 /* ARGSUSED */
 static void
 sched_setup(void *dummy)
 {
 
 	setup_runqs();
 
 	/* thread0 is already running, so account for it. */
 	sched_load_add();
 }
 
 /*
  * SYSINIT hook run once stathz and hz are known: choose the effective
  * statclock rate, derive a time slice of roughly 100ms from it and
  * recompute hogticks.
  */
 static void
 sched_initticks(void *dummy)
 {
 
 	/* Fall back to hz when there is no separate stat clock. */
 	if (stathz != 0)
 		realstathz = stathz;
 	else
 		realstathz = hz;
 	sched_slice = realstathz / 10;	/* ~100ms */
 	hogticks = imax(1,
 	    (2 * hz * sched_slice + realstathz / 2) / realstathz);
 }
 
 /* External interfaces start here */
 
 /*
  * Very early in the boot some setup of scheduler-specific
  * parts of proc0 and of some scheduler resources needs to be done.
  * Called from:
  *  proc0_init()
  */
 void
 schedinit(void)
 {
 
 	/*
 	 * Set up the scheduler specific parts of thread0.
 	 */
 	thread0.td_lock = &sched_lock;
 	td_get_sched(&thread0)->ts_slice = sched_slice;
 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
 }
 
 /*
  * Return non-zero when the global run queue or, on SMP kernels, the
  * current CPU's run queue has something to run.
  */
 int
 sched_runnable(void)
 {
 	int found;
 
 	found = runq_check(&runq);
 #ifdef SMP
 	found += runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
 #endif
 	return (found);
 }
 
 /*
  * Return the time slice expressed in hz ticks (rounded to nearest,
  * never less than 1).
  */
 int
 sched_rr_interval(void)
 {
 	int hzticks;
 
 	/* Convert sched_slice from stathz to hz. */
 	hzticks = (sched_slice * hz + realstathz / 2) / realstathz;
 	return (imax(1, hzticks));
 }
 
 /*
  * We adjust the priority of the current process.  The priority of a
  * process gets worse as it accumulates CPU time.  The cpu usage
  * estimator (ts_estcpu) is increased here.  resetpriority() will
  * compute a different priority each time ts_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached).  The
  * cpu usage estimator ramps up quite quickly when the process is
  * running (linearly), and decays away exponentially, at a rate which
  * is proportionally slower when the system is busy.  The basic
  * principle is that the system will 90% forget that the process used
  * a lot of CPU time in 5 * loadav seconds.  This causes the system to
  * favor processes which haven't run much recently, and to round-robin
  * among other processes.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct td_sched *ts;
 	struct pcpuidlestat *idle;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 
 	/* Charge this tick to the thread. */
 	ts->ts_cpticks++;
 	ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
 
 	/* Recompute the priority each time estcpu reaches a new level. */
 	if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
 		resetpriority(td);
 		resetpriority_thread(td);
 	}
 
 	/*
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).
 	 */
 	if (!TD_IS_IDLETHREAD(td)) {
 		ts->ts_slice--;
 		if (ts->ts_slice <= 0) {
 			ts->ts_slice = sched_slice;
 			td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 		}
 	}
 
 	/* Roll this CPU's idle call counter over for the next tick. */
 	idle = DPCPU_PTR(idlestat);
 	idle->oldidlecalls = idle->idlecalls;
 	idle->idlecalls = 0;
 }
 
 /*
  * Charge child's scheduling CPU usage to parent.  The charge goes to
  * the first thread of process p; the caller must hold p's lock.
  */
 void
 sched_exit(struct proc *p, struct thread *td)
 {
 	struct thread *parent_td;
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
 	    "prio:%d", td->td_priority);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	parent_td = FIRST_THREAD_IN_PROC(p);
 	sched_exit_thread(parent_td, td);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
 	    "prio:%d", child->td_priority);
 	thread_lock(td);
 	td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
 	    td_get_sched(child)->ts_estcpu);
 	thread_unlock(td);
 	thread_lock(child);
 	if ((child->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 	thread_unlock(child);
 }
 
 /*
  * Process fork hook; all of the work is done by sched_fork_thread().
  */
 void
 sched_fork(struct thread *td, struct thread *childtd)
 {
 
 	sched_fork_thread(td, childtd);
 }
 
 /*
  * Initialize the scheduler state of a newly created thread.  The child
  * starts off-CPU under sched_lock, shares the parent's cpuset, and
  * inherits the parent's estcpu and affinity flag; it gets a one-tick
  * initial slice.
  */
 void
 sched_fork_thread(struct thread *td, struct thread *childtd)
 {
 	struct td_sched *child_ts, *parent_ts;
 
 	childtd->td_oncpu = NOCPU;
 	childtd->td_lastcpu = NOCPU;
 	childtd->td_lock = &sched_lock;
 	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
 	childtd->td_priority = childtd->td_base_pri;
 
 	/* Start from a clean slate, then copy what is inherited. */
 	child_ts = td_get_sched(childtd);
 	bzero(child_ts, sizeof(*child_ts));
 	parent_ts = td_get_sched(td);
 	child_ts->ts_estcpu = parent_ts->ts_estcpu;
 	child_ts->ts_flags |= (parent_ts->ts_flags & TSF_AFFINITY);
 	child_ts->ts_slice = 1;
 }
 
 /*
  * Set the nice value of process p and recompute the priority of each
  * of its threads.  The caller must hold p's lock.
  */
 void
 sched_nice(struct proc *p, int nice)
 {
 	struct thread *t;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_nice = nice;
 
 	FOREACH_THREAD_IN_PROC(p, t) {
 		thread_lock(t);
 		resetpriority(t);
 		resetpriority_thread(t);
 		thread_unlock(t);
 	}
 }
 
 /*
  * Record a new priority class for td.  The caller must hold td's
  * thread lock.
  */
 void
 sched_class(struct thread *td, int class)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_pri_class = class;
 }
 
 /*
  * Adjust the priority of a thread.
  *
  * Emits the priority-change trace point and probe (plus the lend
  * variants when another thread's priority value is being raised),
  * stores the new priority, and requeues the thread if it sits on a
  * run queue whose index no longer matches.
  */
 static void
 sched_priority(struct thread *td, u_char prio)
 {
 
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
 		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
 		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	if (td->td_priority != prio) {
 		td->td_priority = prio;
 		/* Move a queued thread to the queue matching its priority. */
 		if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
 			sched_rem(td);
 			sched_add(td, SRQ_BORING);
 		}
 	}
 }
 
 /*
  * Lend td the priority prio: mark the thread as borrowing and apply
  * the new priority.
  */
 void
 sched_lend_prio(struct thread *td, u_char prio)
 {
 
 	td->td_flags |= TDF_BORROWING;
 	sched_priority(td, prio);
 }
 
 /*
  * Restore a thread's priority when priority propagation is
  * over.  The prio argument is the minimum priority the thread
  * needs to have to satisfy other possible priority lending
  * requests.  If the thread's regular priority is less
  * important than prio the thread will keep a priority boost
  * of prio.
  */
 void
 sched_unlend_prio(struct thread *td, u_char prio)
 {
 	u_char own_pri;
 
 	/* Timeshare threads fall back to their user priority. */
 	own_pri = (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE) ?
 	    td->td_user_pri : td->td_base_pri;
 
 	if (prio < own_pri) {
 		/* Another lender still requires a boost. */
 		sched_lend_prio(td, prio);
 		return;
 	}
 	td->td_flags &= ~TDF_BORROWING;
 	sched_prio(td, own_pri);
 }
 
 /*
  * Set td's base priority to prio and, unless the thread is borrowing a
  * better priority, its active priority as well.  A thread blocked on a
  * turnstile has the turnstile told about the change.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	u_char prev;
 
 	/* First, update the base priority. */
 	td->td_base_pri = prio;
 
 	/*
 	 * If the thread is borrowing another thread's priority, don't ever
 	 * lower the priority.
 	 */
 	if ((td->td_flags & TDF_BORROWING) != 0 && td->td_priority < prio)
 		return;
 
 	/* Change the real priority. */
 	prev = td->td_priority;
 	sched_priority(td, prio);
 
 	/*
 	 * If the thread is on a turnstile, then let the turnstile update
 	 * its state.
 	 */
 	if (TD_ON_LOCK(td) && prev != prio)
 		turnstile_adjust(td, prev);
 }
 
 /*
  * Record prio as td's base user priority.  The effective user priority
  * follows it only when prio is better than the currently lent user
  * priority.
  */
 void
 sched_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_base_user_pri = prio;
 	if (td->td_lend_user_pri > prio)
 		td->td_user_pri = prio;
 }
 
 /*
  * Lend td a user priority.  The effective user priority becomes the
  * better of prio and the base user priority; the active priority is
  * raised to match when it is worse, otherwise a reschedule is requested
  * if the two differ.
  */
 void
 sched_lend_user_prio(struct thread *td, u_char prio)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_lend_user_pri = prio;
 	td->td_user_pri = min(prio, td->td_base_user_pri);
 
 	if (td->td_priority > td->td_user_pri) {
 		sched_prio(td, td->td_user_pri);
 	} else if (td->td_priority < td->td_user_pri) {
 		td->td_flags |= TDF_NEEDRESCHED;
 	}
 }
 
 void
 sched_sleep(struct thread *td, int pri)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	td->td_slptick = ticks;
 	td_get_sched(td)->ts_slptime = 0;
 	if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_prio(td, pri);
 	if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
 		td->td_flags |= TDF_CANSWAP;
 }
 
 /*
  * Switch from thread td to newtd, or to whatever choosethread() picks
  * when newtd is NULL.  Called with td's thread lock held; td is put
  * back on a run queue if it is still running and is not an idle thread.
  * When td later resumes here it holds sched_lock.
  */
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct mtx *tmtx;
 	struct td_sched *ts;
 	struct proc *p;
 	int preempted;
 
 	/*
 	 * NOTE(review): ts and p are assigned here but not otherwise used
 	 * in this function.
 	 */
 	tmtx = NULL;
 	ts = td_get_sched(td);
 	p = td->td_proc;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * Switch to the sched lock to fix things up and pick
 	 * a new thread.
 	 * Block the td_lock in order to avoid breaking the critical path.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		tmtx = thread_lock_block(td);
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
 
 	td->td_lastcpu = td->td_oncpu;
 	/*
 	 * The switch counts as a preemption unless the time slice ended
 	 * or the thread relinquished the CPU.
 	 */
 	preempted = !((td->td_flags & TDF_SLICEEND) ||
 	    (flags & SWT_RELINQUISH));
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	td->td_oncpu = NOCPU;
 
 	/*
 	 * At the last moment, if this thread is still marked RUNNING,
 	 * then put it back on the run queue as it has not been suspended
 	 * or stopped or any thing else similar.  We never put the idle
 	 * threads on the run queue, however.
 	 */
 	if (td->td_flags & TDF_IDLETD) {
 		TD_SET_CAN_RUN(td);
 #ifdef SMP
 		/* This CPU is leaving its idle thread. */
 		CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	} else {
 		if (TD_IS_RUNNING(td)) {
 			/* Put us back on the run queue. */
 			sched_add(td, preempted ?
 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 			    SRQ_OURSELF|SRQ_YIELDING);
 		}
 	}
 	if (newtd) {
 		/*
 		 * The thread we are about to run needs to be counted
 		 * as if it had been added to the run queue and selected.
 		 * It came from:
 		 * * A preemption
 		 * * An upcall
 		 * * A followon
 		 */
 		KASSERT((newtd->td_inhibitors == 0),
 			("trying to run inhibited thread"));
 		newtd->td_flags |= TDF_DIDRUN;
         	TD_SET_RUNNING(newtd);
 		if ((newtd->td_flags & TDF_NOLOAD) == 0)
 			sched_load_add();
 	} else {
 		newtd = choosethread();
 		MPASS(newtd->td_lock == &sched_lock);
 	}
 
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 
 		/* The off-cpu probe is passed the incoming thread. */
 		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
 
                 /* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
 #ifdef KDTRACE_HOOKS
 		/*
 		 * If DTrace has set the active vtime enum to anything
 		 * other than INACTIVE (0), then it should have set the
 		 * function to call.
 		 */
 		if (dtrace_vtime_active)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
 		/*
 		 * Hand cpu_switch() the blocked thread lock if there was
 		 * one, otherwise td's current lock.
 		 */
 		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
 		lock_profile_obtain_lock_success(&sched_lock.lock_object,
 		    0, 0, __FILE__, __LINE__);
 		/*
 		 * Where am I?  What year is it?
 		 * We are in the same thread that went to sleep above,
 		 * but any amount of time may have passed. All our context
 		 * will still be available as will local variables.
 		 * PCPU values however may have changed as we may have
 		 * changed CPU so don't trust cached values of them.
 		 * New threads will go to fork_exit() instead of here
 		 * so if you change things here you may need to change
 		 * things there too.
 		 *
 		 * If the thread above was exiting it will never wake
 		 * up again here, so either it has saved everything it
 		 * needed to, or the thread_wait() or wait() will
 		 * need to reap it.
 		 */
 
 		SDT_PROBE0(sched, , , on__cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else
 		SDT_PROBE0(sched, , , remain__cpu);
 
 #ifdef SMP
 	/* An idle thread resuming means this CPU is idle again. */
 	if (td->td_flags & TDF_IDLETD)
 		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
 #endif
 	/* NOTE(review): presumably records td as sched_lock's owner -- confirm. */
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 	MPASS(td->td_lock == &sched_lock);
 }
 
 /*
  * Make a sleeping thread runnable again.  A thread that slept for more
  * than one schedcpu() pass has its estcpu decayed and its user priority
  * recomputed first.  The sleep bookkeeping is then cleared, the thread
  * gets a fresh time slice and it is added to a run queue.
  */
 void
 sched_wakeup(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	td->td_flags &= ~TDF_CANSWAP;
 
 	if (ts->ts_slptime > 1) {
 		updatepri(td);
 		resetpriority(td);
 	}
 
 	td->td_slptick = 0;
 	ts->ts_slptime = 0;
 	ts->ts_slice = sched_slice;
 
 	sched_add(td, SRQ_BORING);
 }
 
 #ifdef SMP
 /*
  * Try to get an idle CPU other than the current one to notice newly
  * queued work.
  *
  * cpunum restricts the search to that single CPU; NOCPU means any idle
  * CPU will do.  sched_lock must be held.
  *
  * Returns 1 when at least one candidate CPU was found (each candidate is
  * either woken through cpu_idle_wakeup() or sent IPI_AST), and 0 when
  * forwarding is disabled, the system is not in a state to forward, the
  * current CPU should handle the work itself, or no candidate exists.
  */
 static int
 forward_wakeup(int cpunum)
 {
 	struct pcpu *pc;
 	cpuset_t dontuse, map, map2;
 	u_int id, me;
 	int iscpuset;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	CTR0(KTR_RUNQ, "forward_wakeup()");
 
 	/* Bail out if forwarding is off or neither detection method is on. */
 	if ((!forward_wakeup_enabled) ||
 	     (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
 		return (0);
 	if (!smp_started || cold || panicstr)
 		return (0);
 
 	forward_wakeups_requested++;
 
 	/*
 	 * Check the idle mask we received against what we calculated
 	 * before in the old version.
 	 */
 	me = PCPU_GET(cpuid);
 
 	/* Don't bother if we should be doing it ourself. */
 	if (CPU_ISSET(me, &idle_cpus_mask) &&
 	    (cpunum == NOCPU || me == cpunum))
 		return (0);
 
 	/* Never target ourselves, stopped CPUs or CPUs in hlt_cpus_mask. */
 	CPU_SETOF(me, &dontuse);
 	CPU_OR(&dontuse, &stopped_cpus);
 	CPU_OR(&dontuse, &hlt_cpus_mask);
 	CPU_ZERO(&map2);
 	if (forward_wakeup_use_loop) {
 		/* Loop method: a CPU is idle if it is running its idle thread. */
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &dontuse) &&
 			    pc->pc_curthread == pc->pc_idlethread) {
 				CPU_SET(id, &map2);
 			}
 		}
 	}
 
 	if (forward_wakeup_use_mask) {
 		/* Mask method: start from idle_cpus_mask minus the excluded set. */
 		map = idle_cpus_mask;
 		CPU_NAND(&map, &dontuse);
 
 		/* If they are both on, compare and use loop if different. */
 		if (forward_wakeup_use_loop) {
 			if (CPU_CMP(&map, &map2)) {
 				printf("map != map2, loop method preferred\n");
 				map = map2;
 			}
 		}
 	} else {
 		map = map2;
 	}
 
 	/* If we only allow a specific CPU, then mask off all the others. */
 	if (cpunum != NOCPU) {
 		/*
 		 * NOTE(review): this accepts cpunum == mp_maxcpus; confirm
 		 * whether the bound was meant to be exclusive.
 		 */
 		KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
 		iscpuset = CPU_ISSET(cpunum, &map);
 		if (iscpuset == 0)
 			CPU_ZERO(&map);
 		else
 			CPU_SETOF(cpunum, &map);
 	}
 	if (!CPU_EMPTY(&map)) {
 		forward_wakeups_delivered++;
 		/*
 		 * CPUs for which cpu_idle_wakeup() reports success are removed
 		 * from the map; whatever remains gets an IPI.
 		 */
 		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
 			id = pc->pc_cpuid;
 			if (!CPU_ISSET(id, &map))
 				continue;
 			if (cpu_idle_wakeup(pc->pc_cpuid))
 				CPU_CLR(id, &map);
 		}
 		if (!CPU_EMPTY(&map))
 			ipi_selected(map, IPI_AST);
 		return (1);
 	}
 	if (cpunum == NOCPU)
 		printf("forward_wakeup: Idle processor not found\n");
 	return (0);
 }
 
 /*
  * Prod another CPU after a thread of priority `pri' has been queued for
  * it.
  *
  * An idle CPU is woken with cpu_idle_wakeup(), falling back to IPI_AST
  * when that reports failure.  A busy CPU is left alone unless `pri' is
  * numerically lower than the priority of the thread it is running.  In
  * that case, kernels built with IPI_PREEMPTION and PREEMPTION send
  * IPI_PREEMPT (only for pri <= PRI_MAX_ITHD unless FULL_PREEMPTION is
  * defined); otherwise the remote thread is flagged TDF_NEEDRESCHED and
  * the CPU receives IPI_AST.
  */
 static void
 kick_other_cpu(int pri, int cpuid)
 {
 	struct pcpu *pc;
 	int curpri;
 
 	pc = pcpu_find(cpuid);
 
 	/* Idle target: a wakeup is all that is required. */
 	if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
 		forward_wakeups_delivered++;
 		if (cpu_idle_wakeup(cpuid) == 0)
 			ipi_cpu(cpuid, IPI_AST);
 		return;
 	}
 
 	/* Busy target: only disturb it if the new thread outranks it. */
 	curpri = pc->pc_curthread->td_priority;
 	if (curpri <= pri)
 		return;
 
 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
 #if !defined(FULL_PREEMPTION)
 	if (pri <= PRI_MAX_ITHD)
 #endif /* ! FULL_PREEMPTION */
 	{
 		ipi_cpu(cpuid, IPI_PREEMPT);
 		return;
 	}
 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
 
 	/* Otherwise ask for a reschedule through an AST. */
 	pc->pc_curthread->td_flags |= TDF_NEEDRESCHED;
 	ipi_cpu(cpuid, IPI_AST);
 }
 #endif /* SMP */
 
 #ifdef SMP
 /*
  * Choose the per-CPU run queue a CPU-restricted thread should be put on.
  *
  * The starting candidate is td_lastcpu when the thread is allowed to run
  * there.  Every CPU is then examined; among the CPUs the thread may use,
  * one with a strictly smaller runq_length[] replaces the candidate.
  * sched_lock must be held.  Asserts that some CPU was found.
  */
 static int
 sched_pickcpu(struct thread *td)
 {
 	int best, cpu;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
-	if (THREAD_CAN_SCHED(td, td->td_lastcpu))
+	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
 		best = td->td_lastcpu;
 	else
 		best = NOCPU;
 	CPU_FOREACH(cpu) {
 		/* Skip CPUs outside the thread's allowed set. */
 		if (!THREAD_CAN_SCHED(td, cpu))
 			continue;
 	
 		if (best == NOCPU)
 			best = cpu;
 		else if (runq_length[cpu] < runq_length[best])
 			best = cpu;
 	}
 	KASSERT(best != NOCPU, ("no valid CPUs"));
 
 	return (best);
 }
 #endif
 
 /*
  * Put a runnable thread on a run queue.
  *
  * The thread lock must be held on entry.  If the thread is not already
  * locked by sched_lock, sched_lock is acquired and becomes the thread's
  * lock.  `flags' carries SRQ_* bits; SRQ_YIELDING suppresses the
  * preemption attempt, SRQ_INTR suppresses wakeup forwarding, and
  * SRQ_PREEMPTED is reported to the enqueue probe.
  *
  * If maybe_preempt() reports that it took care of the thread, the
  * function returns without touching the run queue or the load count.
  *
  * Two bodies exist: the SMP one chooses between the global queue and a
  * per-CPU queue and notifies other CPUs, the uniprocessor one always
  * uses the global queue.
  */
 void
 sched_add(struct thread *td, int flags)
 #ifdef SMP
 {
 	cpuset_t tidlemsk;
 	struct td_sched *ts;
 	u_int cpu, cpuid;
 	int forwarded = 0;
 	int single_cpu = 0;
 
 	ts = td_get_sched(td);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 
 	/*
 	 * If SMP is started and the thread is pinned or otherwise limited to
 	 * a specific set of CPUs, queue the thread to a per-CPU run queue.
 	 * Otherwise, queue the thread to the global run queue.
 	 *
 	 * If SMP has not yet been started we must use the global run queue
 	 * as per-CPU state may not be initialized yet and we may crash if we
 	 * try to access the per-CPU run queues.
 	 */
 	if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
 	    ts->ts_flags & TSF_AFFINITY)) {
 		if (td->td_pinned != 0)
 			cpu = td->td_lastcpu;
 		else if (td->td_flags & TDF_BOUND) {
 			/* Find CPU from bound runq. */
 			KASSERT(SKE_RUNQ_PCPU(ts),
 			    ("sched_add: bound td_sched not on cpu runq"));
 			cpu = ts->ts_runq - &runq_pcpu[0];
 		} else
 			/* Find a valid CPU for our cpuset */
 			cpu = sched_pickcpu(td);
 		ts->ts_runq = &runq_pcpu[cpu];
 		single_cpu = 1;
 		CTR3(KTR_RUNQ,
 		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
 		    cpu);
 	} else {
 		CTR2(KTR_RUNQ,
 		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
 		    td);
 		cpu = NOCPU;
 		ts->ts_runq = &runq;
 	}
 
 	cpuid = PCPU_GET(cpuid);
 	if (single_cpu && cpu != cpuid) {
 		/* Queued for a specific remote CPU: notify that CPU. */
 	        kick_other_cpu(td->td_priority, cpu);
 	} else {
 		if (!single_cpu) {
 			/*
 			 * Global queue: try to forward the wakeup when this
 			 * CPU is busy, the request is not SRQ_INTR, and some
 			 * other CPU in idle_cpus_mask is outside hlt_cpus_mask.
 			 */
 			tidlemsk = idle_cpus_mask;
 			CPU_NAND(&tidlemsk, &hlt_cpus_mask);
 			CPU_CLR(cpuid, &tidlemsk);
 
 			if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
 			    ((flags & SRQ_INTR) == 0) &&
 			    !CPU_EMPTY(&tidlemsk))
 				forwarded = forward_wakeup(cpu);
 		}
 
 		/* Nobody else was notified: handle it on this CPU. */
 		if (!forwarded) {
 			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
 				return;
 			else
 				maybe_resched(td);
 		}
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	/* Per-CPU queue lengths are tracked for sched_pickcpu(). */
 	if (cpu != NOCPU)
 		runq_length[cpu]++;
 }
 #else /* SMP */
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 	    ("sched_add: bad thread state"));
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
 	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
 	    flags & SRQ_PREEMPTED);
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
 	 * to the scheduler's lock.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
 		thread_lock_set(td, &sched_lock);
 	}
 	TD_SET_RUNQ(td);
 	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
 	ts->ts_runq = &runq;
 
 	/*
 	 * If we are yielding (on the way out anyhow) or the thread
 	 * being saved is US, then don't try to be smart about preemption
 	 * or kicking off another CPU as it won't help and may hinder.
 	 * In the YIELDING case, we are about to run whoever is being
 	 * put in the queue anyhow, and in the OURSELF case, we are
 	 * putting ourself on the run queue which also only happens
 	 * when we are about to yield.
 	 */
 	if ((flags & SRQ_YIELDING) == 0) {
 		if (maybe_preempt(td))
 			return;
 	}
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
 	maybe_resched(td);
 }
 #endif /* SMP */
 
 /*
  * Take a thread off the run queue it is currently on.
  *
  * sched_lock must be held and the thread must be in memory and on a run
  * queue.  The load count is dropped unless the thread is TDF_NOLOAD, the
  * per-CPU queue length is adjusted on SMP, and the thread is left in the
  * "can run" state.
  */
 void
 sched_rem(struct thread *td)
 {
 	struct td_sched *sched;
 
 	sched = td_get_sched(td);
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_rem: thread swapped out"));
 	KASSERT(TD_ON_RUNQ(td),
 	    ("sched_rem: thread not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
 	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 
 	if (!(td->td_flags & TDF_NOLOAD))
 		sched_load_rem();
 
 #ifdef SMP
 	/* Anything other than the global queue is a per-CPU queue. */
 	if (sched->ts_runq != &runq)
 		runq_length[sched->ts_runq - runq_pcpu] -= 1;
 #endif
 
 	runq_remove(sched->ts_runq, td);
 	TD_SET_CAN_RUN(td);
 }
 
 /*
  * Select threads to run.  Note that running threads still consume a
  * slot.
  */
 struct thread *
 sched_choose(void)
 {
 	struct thread *td;
 	struct runq *rq;
 
 	mtx_assert(&sched_lock,  MA_OWNED);
 #ifdef SMP
 	struct thread *tdcpu;
 
 	rq = &runq;
 	td = runq_choose_fuzz(&runq, runq_fuzz);
 	tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
 
 	if (td == NULL ||
 	    (tdcpu != NULL &&
 	     tdcpu->td_priority < td->td_priority)) {
 		CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
 		     PCPU_GET(cpuid));
 		td = tdcpu;
 		rq = &runq_pcpu[PCPU_GET(cpuid)];
 	} else {
 		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
 	}
 
 #else
 	rq = &runq;
 	td = runq_choose(&runq);
 #endif
 
 	if (td) {
 #ifdef SMP
 		if (td == tdcpu)
 			runq_length[PCPU_GET(cpuid)]--;
 #endif
 		runq_remove(rq, td);
 		td->td_flags |= TDF_DIDRUN;
 
 		KASSERT(td->td_flags & TDF_INMEM,
 		    ("sched_choose: thread swapped out"));
 		return (td);
 	}
 	return (PCPU_GET(idlethread));
 }
 
 /*
  * Preempt the given thread.  With the thread lock held, an involuntary
  * preemption switch is performed unless td_critnest is greater than 1,
  * in which case the preemption is only recorded in td_owepreempt.
  */
 void
 sched_preempt(struct thread *td)
 {
 
 	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 	thread_lock(td);
 	if (td->td_critnest <= 1)
 		mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
 	else
 		td->td_owepreempt = 1;
 	thread_unlock(td);
 }
 
 /*
  * Restore the user priority of a thread that is returning to user mode.
  * The thread must not be running on a borrowed priority.
  */
 void
 sched_userret(struct thread *td)
 {
 
 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
 	    ("thread with borrowed priority returning to userland"));
 
 	/*
 	 * XXX The comparison below is made without the thread lock so that
 	 * the common case stays lock-free.  Setting td_priority here is
 	 * only an incomplete workaround for it not being set properly
 	 * elsewhere.  Since some interrupt handlers are threads, it can be
 	 * clobbered again between this point and the actual return to user
 	 * mode, so no effort is spent on getting it perfect.
 	 */
 	if (td->td_priority == td->td_user_pri)
 		return;
 
 	thread_lock(td);
 	td->td_priority = td->td_user_pri;
 	td->td_base_pri = td->td_user_pri;
 	thread_unlock(td);
 }
 
 /*
  * Bind the current thread to `cpu'.
  *
  * The thread lock must be held, not recursed, and td must be curthread.
  * TDF_BOUND is always set.  On SMP the thread's run queue is pointed at
  * the per-CPU queue of `cpu' and, if the caller is running elsewhere, a
  * voluntary switch is performed.
  */
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct td_sched *sched;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
 	KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
 
 	sched = td_get_sched(td);
 	td->td_flags = td->td_flags | TDF_BOUND;
 
 #ifdef SMP
 	sched->ts_runq = &runq_pcpu[cpu];
 
 	/* Already on the target CPU: no switch needed. */
 	if (PCPU_GET(cpuid) != cpu)
 		mi_switch(SW_VOL, NULL);
 #endif
 }
 
 /*
  * Undo sched_bind() for the current thread by clearing TDF_BOUND.
  * The thread lock must be held and td must be curthread.
  */
 void
 sched_unbind(struct thread* td)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
 
 	td->td_flags = td->td_flags & ~TDF_BOUND;
 }
 
 /*
  * Report whether the thread has TDF_BOUND set.  The result is the raw
  * masked flag value (non-zero when bound), not a normalised 0/1.
  * The thread lock must be held.
  */
 int
 sched_is_bound(struct thread *td)
 {
 	int bound;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	bound = td->td_flags & TDF_BOUND;
 	return (bound);
 }
 
 /*
  * Voluntarily give up the CPU: take the thread lock, perform a voluntary
  * switch tagged SWT_RELINQUISH, and drop the lock once switched back in.
  */
 void
 sched_relinquish(struct thread *td)
 {
 
 	thread_lock(td);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 	thread_unlock(td);
 }
 
 /*
  * Return the scheduler's current load figure, i.e. sched_tdcnt.
  */
 int
 sched_load(void)
 {
 	int load;
 
 	load = sched_tdcnt;
 	return (load);
 }
 
 /*
  * Size in bytes of a process structure as this scheduler needs it;
  * no scheduler-private data is appended to struct proc.
  */
 int
 sched_sizeof_proc(void)
 {
 	int size;
 
 	size = sizeof(struct proc);
 	return (size);
 }
 
 /*
  * Size in bytes of a thread structure as this scheduler needs it:
  * struct thread plus room for a struct td_sched.
  */
 int
 sched_sizeof_thread(void)
 {
 	int size;
 
 	size = sizeof(struct thread) + sizeof(struct td_sched);
 	return (size);
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	struct td_sched *ts;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	return (ts->ts_pctcpu);
 }
 
 #ifdef RACCT
 /*
  * Calculates the contribution to the thread cpu usage for the latest
  * (unfinished) second.
  */
 fixpt_t
 sched_pctcpu_delta(struct thread *td)
 {
 	struct td_sched *ts;
 	fixpt_t delta;
 	int realstathz;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	ts = td_get_sched(td);
 	delta = 0;
 	realstathz = stathz ? stathz : hz;
 	if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
 		delta = (realstathz == 100)
 		    ? ((fixpt_t) ts->ts_cpticks) <<
 		    (FSHIFT - CCPU_SHIFT) :
 		    100 * (((fixpt_t) ts->ts_cpticks)
 		    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 		delta = ((FSCALE - ccpu) *
 		    (ts->ts_cpticks *
 		    FSCALE / realstathz)) >> FSHIFT;
 #endif
 	}
 
 	return (delta);
 }
 #endif
 
 u_int
 sched_estcpu(struct thread *td)
 {
 	
 	return (td_get_sched(td)->ts_estcpu);
 }
 
 /*
  * Body of the per-CPU idle thread.  It never returns: it idles the CPU
  * until sched_runnable() reports work, then switches away under
  * sched_lock and starts over when it is scheduled again.
  */
 void
 sched_idletd(void *dummy)
 {
 	struct pcpuidlestat *idle;
 	int busy;
 
 	THREAD_NO_SLEEPING();
 	idle = DPCPU_PTR(idlestat);
 	while (1) {
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		while (!sched_runnable()) {
 			/* Hint to cpu_idle() based on recent idle call counts. */
 			busy = idle->idlecalls + idle->oldidlecalls > 64;
 			cpu_idle(busy);
 			idle->idlecalls++;
 		}
 
 		mtx_lock_spin(&sched_lock);
 		mi_switch(SW_VOL | SWT_IDLE, NULL);
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 /*
  * A CPU is entering for the first time or a thread is exiting.
  *
  * td == NULL is the first case: sched_lock is acquired here and the
  * per-CPU switch timestamps are initialised.  Otherwise td is the
  * exiting thread, which must already be locked by sched_lock; its
  * td_oncpu is saved into td_lastcpu and then cleared.  Either way the
  * function switches to the thread returned by choosethread() and does
  * not return.
  */
 void
 sched_throw(struct thread *td)
 {
 	/*
 	 * Correct spinlock nesting.  The idle thread context that we are
 	 * borrowing was created so that it would start out with a single
 	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
 	 * explicitly acquired locks in this function, the nesting count
 	 * is now 2 rather than 1.  Since we are nested, calling
 	 * spinlock_exit() will simply adjust the counts without allowing
 	 * spin lock using code to interrupt us.
 	 */
 	if (td == NULL) {
 		mtx_lock_spin(&sched_lock);
 		spinlock_exit();
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
 	} else {
 		lock_profile_release_lock(&sched_lock.lock_object);
 		MPASS(td->td_lock == &sched_lock);
 		/* Remember where the thread last ran; it is off-CPU from now on. */
 		td->td_lastcpu = td->td_oncpu;
 		td->td_oncpu = NOCPU;
 	}
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Both paths must arrive here with exactly one spin lock level. */
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	cpu_throw(td, choosethread());	/* doesn't return */
 }
 
 /*
  * Scheduler part of a thread's first entry onto a CPU.  Records the CPU
  * the thread is running on and makes the thread the owner of sched_lock
  * by storing its pointer directly into the lock word.
  */
 void
 sched_fork_exit(struct thread *td)
 {
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with sched_lock held but not recursed.
 	 */
 	td->td_oncpu = PCPU_GET(cpuid);
 	/* Ownership is transferred by hand rather than via mtx_lock_spin(). */
 	sched_lock.mtx_lock = (uintptr_t)td;
 	lock_profile_obtain_lock_success(&sched_lock.lock_object,
 	    0, 0, __FILE__, __LINE__);
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 }
 
 /*
  * Return a name for the thread suitable for trace output.
  *
  * With KTR the name is "<td_name> tid <td_tid>", formatted once into the
  * scheduler-private ts_name buffer and reused until that buffer is
  * emptied again.  Without KTR it is simply td_name.
  */
 char *
 sched_tdname(struct thread *td)
 {
 #ifdef KTR
 	struct td_sched *sched;
 
 	sched = td_get_sched(td);
 
 	/* An empty buffer means the name has not been built yet. */
 	if (*sched->ts_name == '\0') {
 		snprintf(sched->ts_name, sizeof(sched->ts_name),
 		    "%s tid %d", td->td_name, td->td_tid);
 	}
 	return (sched->ts_name);
 #else
 	return (td->td_name);
 #endif
 }
 
 #ifdef KTR
 void
 sched_clear_tdname(struct thread *td)
 {
 	struct td_sched *ts;
 
 	ts = td_get_sched(td);
 	ts->ts_name[0] = '\0';
 }
 #endif
 
 /*
  * React to a change of the thread's allowed CPU set.
  *
  * The thread lock must be held.  TSF_AFFINITY is recomputed: it is set
  * when at least one CPU is excluded for the thread.  Unrestricted, pinned
  * and bound threads need no further work.  A thread sitting on a run
  * queue it may no longer use is requeued; a running thread on a CPU it
  * may no longer use is flagged for rescheduling and, if it is not the
  * caller, the CPU it is running on is interrupted.
  *
  * Without SMP this function does nothing.
  */
 void
 sched_affinity(struct thread *td)
 {
 #ifdef SMP
 	struct td_sched *ts;
 	int cpu;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * Set the TSF_AFFINITY flag if there is at least one CPU this
 	 * thread can't run on.
 	 */
 	ts = td_get_sched(td);
 	ts->ts_flags &= ~TSF_AFFINITY;
 	CPU_FOREACH(cpu) {
 		if (!THREAD_CAN_SCHED(td, cpu)) {
 			ts->ts_flags |= TSF_AFFINITY;
 			break;
 		}
 	}
 
 	/*
 	 * If this thread can run on all CPUs, nothing else to do.
 	 */
 	if (!(ts->ts_flags & TSF_AFFINITY))
 		return;
 
 	/* Pinned threads and bound threads should be left alone. */
 	if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
 		return;
 
 	switch (td->td_state) {
 	case TDS_RUNQ:
 		/*
 		 * If we are on a per-CPU runqueue that is in the set,
 		 * then nothing needs to be done.
 		 */
 		if (ts->ts_runq != &runq &&
 		    THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
 			return;
 
 		/* Put this thread on a valid per-CPU runqueue. */
 		sched_rem(td);
 		sched_add(td, SRQ_BORING);
 		break;
 	case TDS_RUNNING:
 		/*
 		 * See if our current CPU is in the set.  If not, force a
 		 * context switch.
 		 */
 		if (THREAD_CAN_SCHED(td, td->td_oncpu))
 			return;
 
 		td->td_flags |= TDF_NEEDRESCHED;
 		/*
 		 * Interrupt the CPU the thread is running on.  The loop
 		 * variable `cpu' only holds the first excluded CPU found
 		 * above, which need not be the CPU executing the thread.
 		 */
 		if (td != curthread)
 			ipi_cpu(td->td_oncpu, IPI_AST);
 		break;
 	default:
 		break;
 	}
 #endif
 }
Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_aio.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/vfs_aio.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/kern/vfs_aio.c	(revision 303517)
@@ -1,2978 +1,2997 @@
 /*-
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 #include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 FEATURE(aio, "Asynchronous I/O");
 
 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
     "Async IO management");
 
 static int enable_aio_unsafe = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
     "Permit asynchronous IO on all file types, not just known-safe types");
 
 static unsigned int unsafe_warningcnt = 1;
 SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
     &unsafe_warningcnt, 0,
     "Warnings that will be triggered upon failed IO requests on unsafe files");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
     "Maximum number of kernel processes to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
     "Number of presently active kernel processes for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
     0,
     "Preferred number of ready kernel processes for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 /* Number of async I/O processes in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0,
     "Maximum active aio requests per process (stored in the process)");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process (stored in the process)");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process (stored in the process)");
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Older layout of the userland aio control block, compiled in only with
  * COMPAT_FREEBSD6.  It carries a struct osigevent where the notification
  * description lives.
  * NOTE(review): presumably this mirrors the FreeBSD 6 ABI's struct aiocb
  * field for field -- confirm against the compat copyin code.
  */
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 #endif
 
 /*
  * Below is a key of locks used to protect each member of struct kaiocb,
  * aioliojob, and kaioinfo, and any backends.
  *
  * * - does not need to be protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * If the routine that services an AIO request blocks while running in an
  * AIO kernel process it can starve other I/O requests.  BIO requests
  * queued via aio_qphysio() complete in GEOM and do not use AIO kernel
  * processes at all.  Socket I/O requests use a separate pool of
  * kprocs and also force non-blocking I/O.  Other file I/O requests
  * use the generic fo_read/fo_write operations which can block.  The
  * fsync and mlock operations can also block while executing.  Ideally
  * none of these requests would block while executing.
  *
  * Note that the service routines cannot toggle O_NONBLOCK in the file
  * structure directly while handling a request due to races with
  * userland threads.
  */
 
 /* jobflags */
 #define	KAIOCB_QUEUEING		0x01
 #define	KAIOCB_CANCELLED	0x02
 #define	KAIOCB_CANCELLING	0x04
 #define	KAIOCB_CHECKSYNC	0x08
 #define	KAIOCB_CLEARED		0x10
 #define	KAIOCB_FINISHED		0x20
 
 /*
  * AIO process info: one record per AIO kernel process.  The letters in
  * parentheses refer to the lock key earlier in this file; (c) members
  * are protected by aio_job_mtx.
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aioproc {
 	int	aioprocflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
 	struct	proc *aioproc;			/* (*) the AIO proc */
 };
 
 /*
  * Data structure for lio signal management.  All members are protected
  * by the owning process's kaioinfo lock, marked (a).
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags (LIOJ_*) */
 	int	lioj_count;			/* (a) count of jobs; the lio is freed when it reaches 0 */
 	int	lioj_finished_count;		/* (a) presumably count of finished jobs -- confirm */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct	knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * Per-process aio data structure, reached through p->p_aioinfo.  The
  * letters in parentheses refer to the lock key earlier in this file.
  */
 struct kaioinfo {
 	struct	mtx kaio_mtx;		/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
 	int	kaio_buffer_count;	/* (a) number of physio buffers */
 	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
 	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
 	struct	task kaio_task;		/* (*) task to kick aio processes */
 	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
 
 /*
  * Operations used to interact with userland aio control blocks.
  * Different ABIs provide their own operations.
  *
  * copyin takes a destination block as well; the fetch_* hooks return a
  * long, and the store_* hooks take the value to write and return an int.
  * NOTE(review): presumably `ujob' is always a user-space address and the
  * int results are errno values -- confirm against the implementations.
  */
 struct aiocb_ops {
 	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
 	long	(*fetch_status)(struct aiocb *ujob);
 	long	(*fetch_error)(struct aiocb *ujob);
 	int	(*store_status)(struct aiocb *ujob, long status);
 	int	(*store_error)(struct aiocb *ujob, long error);
 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
 };
 
 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct kaiocb *job);
 static void	aio_process_rw(struct kaiocb *job);
 static void	aio_process_sync(struct kaiocb *job);
 static void	aio_process_mlock(struct kaiocb *job);
 static void	aio_schedule_fsync(void *context, int pending);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *ujob,
 		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
 static int	aio_queue_file(struct file *fp, struct kaiocb *job);
 static void	aio_physwakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p,
 		    struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct kaiocb *job);
 static void	aio_daemon(void *param);
 static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
+static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiop	async io process data
  *	aiocb	async io jobs
  *	aiol	list io job pointer - internal to aio_suspend XXX
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
 
 /*
  * kqueue filter operations for aio.  aio_filtops is registered for
  * EVFILT_AIO and lio_filtops for EVFILT_LIO in aio_onceonly(); neither
  * filter is tied to a file descriptor (f_isfd is 0).
  */
 static struct filterops aio_filtops = {
 	.f_attach = filt_aioattach,
 	.f_detach = filt_aiodetach,
 	.f_event = filt_aio,
 	.f_isfd = 0,
 };
 
 static struct filterops lio_filtops = {
 	.f_attach = filt_lioattach,
 	.f_detach = filt_liodetach,
 	.f_event = filt_lio,
 	.f_isfd = 0,
 };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_kick);
 
 /*
  * Module event handler for the aio module.
  *
  * MOD_LOAD performs the one-time initialisation, MOD_SHUTDOWN is accepted
  * without doing anything, and every other command is refused with
  * EOPNOTSUPP.  Returns 0 for the two accepted commands.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 
 	if (cmd == MOD_LOAD) {
 		aio_onceonly();
 		return (0);
 	}
 	if (cmd == MOD_SHUTDOWN)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description: name, event handler and an unused extra argument
  * (NULL).
  */
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 /* Register the module at SI_SUB_VFS with no particular ordering. */
 DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization, called from aio_modload() on MOD_LOAD.
  *
  * Registers the process exit/exec rundown handlers and the two kqueue
  * filters, initialises the global daemon list, job list, semaphore,
  * mutex and unit-number allocator, creates the UMA zones, seeds the
  * tunables that have no static initialiser, and publishes the POSIX.1b
  * configuration values.  Always returns 0.
  */
 static int
 aio_onceonly(void)
 {
 
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	/* All zones are created with UMA_ZONE_NOFREE. */
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/* Each AIOL item holds AIO_LISTIO_MAX pointer-sized slots. */
 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  *
  * A fresh kaioinfo is always allocated and filled in from the current
  * global limits.  It is installed only if the process still has none;
  * otherwise the new one is torn down again.  Afterwards AIO kernel
  * processes are started until num_aio_procs reaches the smaller of
  * target_aio_procs and max_aio_procs.
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	/* M_WAITOK: the allocation is made before taking the proc lock. */
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
 	ki->kaio_flags = 0;
 	ki->kaio_maxactive_count = max_aio_per_proc;
 	ki->kaio_active_count = 0;
 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
 	ki->kaio_count = 0;
 	ki->kaio_ballowed_count = max_buf_aio;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TAILQ_INIT(&ki->kaio_syncready);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		/* p_aioinfo was already set: discard our copy. */
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	/* Top up the pool of AIO kernel processes. */
 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
 		aio_newproc(NULL);
 }
 
 /*
  * Deliver the completion signal described by 'sigev' to process 'p'.
  * Returns the error from sigev_findtd(), or 0 once the signal has been
  * handled.  The ksiginfo is only sent if it is not already queued.
  * NOTE(review): the process lock is released here without being taken,
  * so sigev_findtd() presumably returns with it held on success - confirm.
  */
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *target;
 	int rv;
 
 	rv = sigev_findtd(p, sigev, &target);
 	if (rv != 0)
 		return (rv);
 
 	if (!KSI_ONQ(ksi)) {
 		ksiginfo_set_sigev(ksi, sigev);
 		ksi->ksi_code = SI_ASYNCIO;
 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		tdsendsignal(p, target, ksi->ksi_signo, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Free a finished job entry belonging to the current process.
  *
  * The caller must hold the per-process AIO lock.  The lock is dropped
  * while the file and credential references are released and the kaiocb is
  * returned to its zone, and it is re-acquired before returning.  The job
  * must already be marked KAIOCB_FINISHED; it is removed from the
  * kaio_done and kaio_all lists.  If the job was the last member of a
  * lio_listio() group, the group is torn down as well.  Always returns 0.
  */
 static int
 aio_free_entry(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = job->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	/* Drop the job from the global and per-process queue counts. */
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, job, plist);
 	TAILQ_REMOVE(&ki->kaio_all, job, allist);
 
 	lj = job->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* job is going away, we need to destroy any knotes */
 	knlist_delete(&job->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&job->ksi);
 	PROC_UNLOCK(p);
 
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * a kaiocb from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete().  This does mean that it is possible for the
 	 * thread pointer at close time to differ from the thread pointer
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
 	if (job->fd_file)
 		fdrop(job->fd_file, curthread);
 	crfree(job->cred);
 	uma_zfree(aiocb_zone, job);
 	/* Callers expect the AIO lock to still be held on return. */
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 /*
  * Adapter with the three-argument handler signature; the image
  * parameters are ignored and the work is done by aio_proc_rundown().
  */
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p,
     struct image_params *imgp __unused)
 {
 
 	aio_proc_rundown(arg, p);
 }
 
 /*
  * Try to cancel one job of process 'p'.  Must be called with the
  * per-process AIO lock held; the lock is dropped around the job's cancel
  * callback and re-acquired afterwards.  Returns non-zero only when the job
  * has finished with ECANCELED by the time the callback returns; returns 0
  * when the job was already cancelled or finished, has no cancel routine,
  * or did not complete as cancelled.
  */
 static int
 aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
 {
 	aio_cancel_fn_t *func;
 	int cancelled;
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
 		return (0);
 	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
 	job->jobflags |= KAIOCB_CANCELLED;
 
 	func = job->cancel_fn;
 
 	/*
 	 * If there is no cancel routine, just leave the job marked as
 	 * cancelled.  The job should be in active use by a caller who
 	 * should complete it normally or when it fails to install a
 	 * cancel routine.
 	 */
 	if (func == NULL)
 		return (0);
 
 	/*
 	 * Set the CANCELLING flag so that aio_complete() will defer
 	 * completions of this job.  This prevents the job from being
 	 * freed out from under the cancel callback.  After the
 	 * callback any deferred completion (whether from the callback
 	 * or any other source) will be completed.
 	 */
 	job->jobflags |= KAIOCB_CANCELLING;
 	AIO_UNLOCK(ki);
 	func(job);
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_CANCELLING;
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/* Run the completion that aio_complete() deferred. */
 		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(p, job);
 	} else {
 		/*
 		 * The cancel callback might have scheduled an
 		 * operation to cancel this request, but it is
 		 * only counted as cancelled if the request is
 		 * cancelled when the callback returns.
 		 */
 		cancelled = 0;
 	}
 	return (cancelled);
 }
 
 /*
  * Rundown the jobs for a given process.
  *
  * Must be called by a thread of 'p' itself.  Marks the process as being
  * run down, cancels what can be cancelled, sleeps until no job is queued
  * or active, frees every completed job and empty LIO group, and finally
  * destroys the per-process AIO state.  'arg' is unused.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kaiocb *job, *jobn;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	/* With this flag set, aio_bio_done_notify() skips notifications. */
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		aio_cancel_job(p, ki, job);
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
 		/* Ask for a wakeup, but also re-check after at most hz ticks. */
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(job);
 
 	/* Any LIO group still listed must be empty by now. */
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	/* Let any queued tasks that reference 'ki' finish before freeing it. */
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
 	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  *
  * Must be called with aio_job_mtx held.  Walks the global job list and
  * picks the first job whose owning process is below its active-job limit.
  * The chosen job is removed from the list and the owner's active count is
  * bumped.  Returns NULL when no job is eligible.  The 'aiop' argument is
  * not used.
  */
 static struct kaiocb *
 aio_selectjob(struct aioproc *aiop)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 restart:
 	TAILQ_FOREACH(job, &aio_jobs, list) {
 		userp = job->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
 			TAILQ_REMOVE(&aio_jobs, job, list);
 			/*
 			 * If the cancel function could not be cleared the
 			 * job is not ours to run; rescan from the head
 			 * since the list was just modified.
 			 */
 			if (!aio_clear_cancel_function(job))
 				goto restart;
 
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			break;
 		}
 	}
 	return (job);
 }
 
 /*
  * Move all data to a permanent storage device.  This code
  * simulates the fsync syscall.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 drop:
 	return (error);
 }
 
 /*
  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
  * does the I/O request for the non-physio version of the operations.  The
  * normal vn operations are used, and this code should work in all instances
  * for every type of file, including pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
 aio_process_rw(struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct aiocb *cb;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	ssize_t cnt;
 	long msgsnd_st, msgsnd_end;
 	long msgrcv_st, msgrcv_end;
 	long oublock_st, oublock_end;
 	long inblock_st, inblock_end;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
 	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = cb->aio_offset;
 	auio.uio_resid = cb->aio_nbytes;
 	cnt = cb->aio_nbytes;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
 	msgrcv_st = td->td_ru.ru_msgrcv;
 	msgsnd_st = td->td_ru.ru_msgsnd;
 	inblock_st = td->td_ru.ru_inblock;
 	oublock_st = td->td_ru.ru_oublock;
 
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
 		if (auio.uio_resid == 0)
 			error = 0;
 		else
 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
 	msgrcv_end = td->td_ru.ru_msgrcv;
 	msgsnd_end = td->td_ru.ru_msgsnd;
 	inblock_end = td->td_ru.ru_inblock;
 	oublock_end = td->td_ru.ru_oublock;
 
 	job->msgrcv = msgrcv_end - msgrcv_st;
 	job->msgsnd = msgsnd_end - msgsnd_st;
 	job->inblock = inblock_end - inblock_st;
 	job->outblock = oublock_end - oublock_st;
 
 	if ((error) && (auio.uio_resid != cnt)) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	cnt -= auio.uio_resid;
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, cnt, 0);
 }
 
 /*
  * Daemon-side handler for LIO_SYNC jobs: fsync the file's vnode (when it
  * has one) under the submitter's credentials, then complete the job with
  * status 0 on success or -1 and the error code on failure.
  */
 static void
 aio_process_sync(struct kaiocb *job)
 {
 	struct thread *td;
 	struct ucred *saved_cred;
 	struct file *fp;
 	int error;
 
 	td = curthread;
 	saved_cred = td->td_ucred;
 	fp = job->fd_file;
 	error = 0;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	td->td_ucred = job->cred;
 	if (fp->f_vnode != NULL)
 		error = aio_fsync_vnode(td, fp->f_vnode);
 	td->td_ucred = saved_cred;
 
 	if (error == 0)
 		aio_complete(job, 0, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 /*
  * Daemon-side handler for LIO_MLOCK jobs: wire the user buffer described
  * by the aiocb on behalf of the submitting process, then complete the job
  * with status 0 on success or -1 and the error code on failure.
  */
 static void
 aio_process_mlock(struct kaiocb *job)
 {
 	struct aiocb *cb;
 	int error;
 
 	cb = &job->uaiocb;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	error = vm_mlock(job->userproc, job->cred,
 	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
 
 	if (error == 0)
 		aio_complete(job, 0, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 /*
  * Finish the bookkeeping for a completed job.  Called with the
  * per-process AIO lock held and with KAIOCB_FINISHED already set.  The job
  * is moved onto the done list, per-job and per-LIO notifications (signal
  * and kevent) are posted unless the process is being run down, fsync jobs
  * waiting on this job are released, and a sleeper waiting for the queue to
  * drain is woken.
  */
 static void
 aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *sjob, *sjobn;
 	int lj_done;
 	bool schedule_fsync;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = job->lio;
 	lj_done = 0;
 	if (lj) {
 		/* The group is done once every member has finished. */
 		lj->lioj_finished_count++;
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	/* No notifications while the process is tearing down its AIO state. */
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
 
 	KNOTE_LOCKED(&job->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		/* Post the group signal at most once. */
 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL
 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	if (job->jobflags & KAIOCB_CHECKSYNC) {
 		/*
 		 * Release fsync jobs on the same file that were queued after
 		 * this job; one becomes ready when its pending count reaches
 		 * zero.
 		 */
 		schedule_fsync = false;
 		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
 			if (job->fd_file != sjob->fd_file ||
 			    job->seqno >= sjob->seqno)
 				continue;
 			if (--sjob->pending > 0)
 				continue;
 			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
-			if (!aio_clear_cancel_function(sjob))
+			if (!aio_clear_cancel_function_locked(sjob))
 				continue;
 			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
 			schedule_fsync = true;
 		}
 		if (schedule_fsync)
 			taskqueue_enqueue(taskqueue_aiod_kick,
 			    &ki->kaio_sync_task);
 	}
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 static void
 aio_schedule_fsync(void *context, int pending)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 
 	ki = context;
 	AIO_LOCK(ki);
 	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
 		job = TAILQ_FIRST(&ki->kaio_syncready);
 		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		AIO_LOCK(ki);
 	}
 	AIO_UNLOCK(ki);
 }
 
 /*
  * Report whether an attempt to clear this job's cancel function was made
  * while a cancel was in progress, i.e. whether KAIOCB_CLEARED is set in
  * the job's flags.
  */
 bool
 aio_cancel_cleared(struct kaiocb *job)
 {
 
 	/*
 	 * The caller should hold the same queue lock held when
 	 * aio_clear_cancel_function() was called and set this flag
 	 * ensuring this check sees an up-to-date value.  However,
 	 * there is no way to assert that.
 	 */
 	return ((job->jobflags & KAIOCB_CLEARED) != 0);
 }
 
-bool
-aio_clear_cancel_function(struct kaiocb *job)
+static bool
+aio_clear_cancel_function_locked(struct kaiocb *job)
 {
-	struct kaioinfo *ki;
 
-	ki = job->userproc->p_aioinfo;
-	AIO_LOCK(ki);
+	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	MPASS(job->cancel_fn != NULL);
 	/*
 	 * A cancel is already running for this job: record that a clear was
 	 * attempted and report failure so the caller leaves the job alone.
 	 */
 	if (job->jobflags & KAIOCB_CANCELLING) {
 		job->jobflags |= KAIOCB_CLEARED;
-		AIO_UNLOCK(ki);
 		return (false);
 	}
 	/* Otherwise remove the cancel function and report success. */
 	job->cancel_fn = NULL;
-	AIO_UNLOCK(ki);
 	return (true);
 }
 
 bool
-aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
+aio_clear_cancel_function(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
+	bool ret;
 
 	/* Locking wrapper: take the per-process AIO lock around the helper. */
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
-	if (job->jobflags & KAIOCB_CANCELLED) {
-		AIO_UNLOCK(ki);
+	ret = aio_clear_cancel_function_locked(job);
+	AIO_UNLOCK(ki);
+	return (ret);
+}
+
+static bool
+aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
+{
+
+	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
+	if (job->jobflags & KAIOCB_CANCELLED)
 		return (false);
-	}
 	/* Not cancelled: install the callback and report success. */
 	job->cancel_fn = func;
-	AIO_UNLOCK(ki);
 	return (true);
 }
 
+bool
+aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
+{
+	struct kaioinfo *ki;
+	bool ret;
+
 	/*
 	 * Locking wrapper: install 'func' as the job's cancel function under
 	 * the per-process AIO lock.  Returns false if the job has already
 	 * been cancelled, in which case nothing is installed.
 	 */
+	ki = job->userproc->p_aioinfo;
+	AIO_LOCK(ki);
+	ret = aio_set_cancel_function_locked(job, func);
+	AIO_UNLOCK(ki);
+	return (ret);
+}
+
 /*
  * Record the final status and error of a job and mark it finished.  If
  * the job is neither still being queued nor in the middle of a cancel
  * callback, it is taken off the job queue and completion notifications
  * are delivered right away; otherwise that part is left to whoever
  * cleared those flags.
  */
 void
 aio_complete(struct kaiocb *job, long status, int error)
 {
 	struct proc *p;
 	struct kaioinfo *ki;
 
 	job->uaiocb._aiocb_private.error = error;
 	job->uaiocb._aiocb_private.status = status;
 
 	p = job->userproc;
 	ki = p->p_aioinfo;
 
 	AIO_LOCK(ki);
 	KASSERT((job->jobflags & KAIOCB_FINISHED) == 0,
 	    ("duplicate aio_complete"));
 	job->jobflags |= KAIOCB_FINISHED;
 	if (!(job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING))) {
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(p, job);
 	}
 	AIO_UNLOCK(ki);
 }
 
 /*
  * Complete a job as cancelled: status -1, error ECANCELED.
  */
 void
 aio_cancel(struct kaiocb *job)
 {
 
 	aio_complete(job, -1, ECANCELED);
 }
 
 /*
  * Switch the calling thread onto the address space of the process that
  * submitted 'job'.
  */
 void
 aio_switch_vmspace(struct kaiocb *job)
 {
 	struct vmspace *vm;
 
 	vm = job->userproc->p_vmspace;
 	vmspace_switch_aio(vm);
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process_*,
  * but the setup (and address space mgmt) is done in this routine.
  *
  * '_id' carries the unit number allocated for this daemon; it is released
  * when the daemon exits.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct kaiocb *job;
 	struct aioproc *aiop;
 	struct kaioinfo *ki;
 	struct proc *p;
 	struct vmspace *myvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = td->td_proc;
 	myvm = vmspace_acquire_ref(p);
 
 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aioproc = p;
 	aiop->aioprocflags = 0;
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aioprocflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aioprocflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((job = aio_selectjob(aiop)) != NULL) {
 			/* Run the handler without holding the job list lock. */
 			mtx_unlock(&aio_job_mtx);
 
 			ki = job->userproc->p_aioinfo;
 			job->handle_fn(job);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&aio_job_mtx);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		/* Nothing to do: advertise this daemon as idle. */
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aioprocflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
 		    (aiop->aioprocflags & AIOP_FREE) &&
 		    num_aio_procs > target_aio_procs)
 			break;
 	}
 	/* Exit path: still idle and on the free list, with the lock held. */
 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
 	num_aio_procs--;
 	mtx_unlock(&aio_job_mtx);
 	uma_zfree(aiop_zone, aiop);
 	free_unr(aiod_unr, id);
 	vmspace_free(myvm);
 
 	KASSERT(p->p_vmspace == myvm,
 	    ("AIOD: bad vmspace for exiting daemon"));
 	KASSERT(myvm->vm_refcnt > 1,
 	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
 	kproc_exit(0);
 }
 
 /*
  * Spawn one more AIO daemon as a kernel process; the daemon sets up the
  * rest of its environment itself.  On success the daemon count is bumped
  * and, when 'start' is non-NULL, the counter it points to is decremented,
  * both under aio_job_mtx.  Returns the kproc_create() error.
  */
 static int
 aio_newproc(int *start)
 {
 	struct proc *p;
 	int error, id;
 
 	id = alloc_unr(aiod_unr);
 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
 	    RFNOWAIT, 0, "aiod%d", id);
 	if (error != 0) {
 		/* The daemon never ran, so give its unit number back. */
 		free_unr(aiod_unr, id);
 		return (error);
 	}
 
 	/*
 	 * Block until the new daemon signals that it is running.
 	 */
 	sema_wait(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	num_aio_procs++;
 	if (start != NULL)
 		(*start)--;
 	mtx_unlock(&aio_job_mtx);
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead physio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  *
  * Returns 0 when the request was handed to the device's strategy routine,
  * -1 when the request is not eligible for this path, ENXIO when the device
  * cannot be referenced, and EFAULT when the user pages cannot be held.
  */
 static int
 aio_qphysio(struct proc *p, struct kaiocb *job)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct bio *bp;
 	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	int error, ref, poff;
 	vm_prot_t prot;
 
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	/* Only character-device vnodes with a known block size qualify. */
 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VCHR)
 		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 	/* The transfer length must be a multiple of the block size. */
 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
 
 	if ((csw->d_flags & D_DISK) == 0) {
 		error = -1;
 		goto unref;
 	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
 	ki = p->p_aioinfo;
 	/* Offset of the user buffer within its first page. */
 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
 	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
 		if (cb->aio_nbytes > MAXPHYS) {
 			error = -1;
 			goto unref;
 		}
 
 		/* Unmapped I/O needs no kernel mapping buffer. */
 		pbuf = NULL;
 	} else {
 		if (cb->aio_nbytes > MAXPHYS - poff) {
 			error = -1;
 			goto unref;
 		}
 		/* Respect the per-process limit on buffer-backed requests. */
 		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
 			error = -1;
 			goto unref;
 		}
 
 		job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
 		BUF_KERNPROC(pbuf);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count++;
 		AIO_UNLOCK(ki);
 	}
 	job->bp = bp = g_alloc_bio();
 
 	bp->bio_length = cb->aio_nbytes;
 	bp->bio_bcount = cb->aio_nbytes;
 	bp->bio_done = aio_physwakeup;
 	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
 	bp->bio_offset = cb->aio_offset;
 	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
 	bp->bio_dev = dev;
 	bp->bio_caller1 = (void *)job;
 
 	prot = VM_PROT_READ;
 	if (cb->aio_lio_opcode == LIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
 	    nitems(job->pages));
 	if (job->npages < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
 	if (pbuf != NULL) {
 		/* Map the held pages into the pbuf's kernel address range. */
 		pmap_qenter((vm_offset_t)pbuf->b_data,
 		    job->pages, job->npages);
 		bp->bio_data = pbuf->b_data + poff;
 		atomic_add_int(&num_buf_aio, 1);
 	} else {
 		/* Hand the page array itself to the driver. */
 		bp->bio_ma = job->pages;
 		bp->bio_ma_n = job->npages;
 		bp->bio_ma_offset = poff;
 		bp->bio_data = unmapped_buf;
 		bp->bio_flags |= BIO_UNMAPPED;
 	}
 
 	/* Perform transfer. */
 	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	/* Undo the pbuf accounting and release the bio. */
 	if (pbuf != NULL) {
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 		relpbuf(pbuf, NULL);
 		job->pbuf = NULL;
 	}
 	g_destroy_bio(bp);
 	job->bp = NULL;
 unref:
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Translate an old-layout sigevent into the current layout.  Only
  * SIGEV_NONE, SIGEV_SIGNAL and SIGEV_KEVENT are accepted for AIO with the
  * old structure; any other notification type yields EINVAL.
  */
 static int
 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
 {
 	int notify;
 
 	nsig->sigev_notify = osig->sigev_notify;
 	notify = nsig->sigev_notify;
 
 	if (notify == SIGEV_NONE)
 		return (0);
 	if (notify == SIGEV_SIGNAL) {
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		return (0);
 	}
 	if (notify == SIGEV_KEVENT) {
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 static int
 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb *ojob;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
 	if (error)
 		return (error);
 	ojob = (struct oaiocb *)kjob;
 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 
 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
 }
 
 /* Read the private status word of a user-space aiocb. */
 static long
 aiocb_fetch_status(struct aiocb *ujob)
 {
 	long status;
 
 	status = fuword(&ujob->_aiocb_private.status);
 	return (status);
 }
 
 static long
 aiocb_fetch_error(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.error));
 }
 
 /* Write the private status word of a user-space aiocb. */
 static int
 aiocb_store_status(struct aiocb *ujob, long status)
 {
 	int rv;
 
 	rv = suword(&ujob->_aiocb_private.status, status);
 	return (rv);
 }
 
 static int
 aiocb_store_error(struct aiocb *ujob, long error)
 {
 
 	return (suword(&ujob->_aiocb_private.error, error));
 }
 
 /* Write the job reference into the kernelinfo word of a user aiocb. */
 static int
 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	int rv;
 
 	rv = suword(&ujob->_aiocb_private.kernelinfo, jobref);
 	return (rv);
 }
 
 /* Store a user aiocb pointer into a user-space pointer slot. */
 static int
 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 	int rv;
 
 	rv = suword(ujobp, (long)ujob);
 	return (rv);
 }
 
 /*
  * Accessors for the native aiocb layout: how to copy a control block in
  * and how to read or write its private fields in user space.
  */
 static struct aiocb_ops aiocb_ops = {
 	.copyin = aiocb_copyin,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Accessors for aiocbs that carry the old sigevent layout.  Identical to
  * the native table except that copyin converts the old sigevent.
  */
 static struct aiocb_ops aiocb_ops_osigevent = {
 	.copyin = aiocb_copyin_old_sigevent,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 #endif
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
  * technique is done in this code.
  *
  * 'ujob' is the user-space control block, 'lj' the lio_listio() group the
  * request belongs to (or NULL), 'type' the opcode to force (LIO_NOP means
  * "use the opcode stored in the aiocb") and 'ops' the accessors matching
  * the aiocb layout.  Returns 0 once the request has been queued or, for
  * LIO_NOP, validated; otherwise returns an errno value.  On failure, the
  * errno is also stored into the user aiocb's error field, except on the
  * copyin-failure path where that store is attempted but may itself fail.
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
 	int type, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	cap_rights_t rights;
 	struct file *fp;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 	u_short evflags;
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	/* Reset the user-visible result fields before doing anything else. */
 	ops->store_status(ujob, -1);
 	ops->store_error(ujob, 0);
 	ops->store_kernelinfo(ujob, -1);
 
 	/* Enforce the global and per-process queue limits. */
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= ki->kaio_qallowed_count) {
 		ops->store_error(ujob, EAGAIN);
 		return (EAGAIN);
 	}
 
 	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	knlist_init_mtx(&job->klist, AIO_MTX(ki));
 
 	error = ops->copyin(ujob, &job->uaiocb);
 	if (error) {
 		ops->store_error(ujob, error);
 		uma_zfree(aiocb_zone, job);
 		return (error);
 	}
 
 	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
 		/* Report the failure in the aiocb like every other path. */
 		ops->store_error(ujob, EINVAL);
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		ops->store_error(ujob, EINVAL);
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
 		/* Report the failure in the aiocb like every other path. */
 		ops->store_error(ujob, EINVAL);
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	ksiginfo_init(&job->ksi);
 
 	/* Save userspace address of the job info. */
 	job->ujob = ujob;
 
 	/* Get the opcode. */
 	if (type != LIO_NOP)
 		job->uaiocb.aio_lio_opcode = type;
 	opcode = job->uaiocb.aio_lio_opcode;
 
 	/*
 	 * Validate the opcode and fetch the file object for the specified
 	 * file descriptor.
 	 *
 	 * XXXRW: Moved the opcode validation up here so that we don't
 	 * retrieve a file descriptor without knowing what the capability
 	 * should be.
 	 */
 	fd = job->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 		error = fget_write(td, fd,
 		    cap_rights_init(&rights, CAP_PWRITE), &fp);
 		break;
 	case LIO_READ:
 		error = fget_read(td, fd,
 		    cap_rights_init(&rights, CAP_PREAD), &fp);
 		break;
 	case LIO_SYNC:
 		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
 		break;
 	case LIO_MLOCK:
 		/* No file is involved; 'error' is still 0 from the copyin. */
 		fp = NULL;
 		break;
 	case LIO_NOP:
 		error = fget(td, fd, cap_rights_init(&rights), &fp);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error) {
 		uma_zfree(aiocb_zone, job);
 		ops->store_error(ujob, error);
 		return (error);
 	}
 
 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if (opcode != LIO_SYNC && job->uaiocb.aio_offset == -1LL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	job->fd_file = fp;
 
 	/* Assign the job reference id and sequence number. */
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	job->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = ops->store_kernelinfo(ujob, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, job);
 		return (0);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	kev.ident = (uintptr_t)job->ujob;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
 	kev.data = (intptr_t)job;
 	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, 1);
 	if (error)
 		goto aqueue_fail;
 
 no_kqueue:
 
 	ops->store_error(ujob, EINPROGRESS);
 	job->uaiocb._aiocb_private.error = EINPROGRESS;
 	job->userproc = p;
 	job->cred = crhold(td->td_ucred);
 	job->jobflags = KAIOCB_QUEUEING;
 	job->lio = lj;
 
 	if (opcode == LIO_MLOCK) {
 		aio_schedule(job, aio_process_mlock);
 		error = 0;
 	} else if (fp->f_ops->fo_aio_queue == NULL)
 		error = aio_queue_file(fp, job);
 	else
 		error = fo_aio_queue(fp, job);
 	if (error) {
 		/*
 		 * The job was never queued, so nobody else will release
 		 * the credential reference taken above; drop it here to
 		 * avoid leaking it.
 		 */
 		crfree(job->cred);
 		goto aqueue_fail;
 	}
 
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_QUEUEING;
 	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	atomic_add_int(&num_queue_count, 1);
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/*
 		 * The queue callback completed the request synchronously.
 		 * The bulk of the completion is deferred in that case
 		 * until this point.
 		 */
 		aio_bio_done_notify(p, job);
 	} else
 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
 	AIO_UNLOCK(ki);
 	return (0);
 
 aqueue_fail:
 	knlist_delete(&job->klist, curthread, 0);
 	if (fp)
 		fdrop(fp, td);
 	uma_zfree(aiocb_zone, job);
 	ops->store_error(ujob, error);
 	return (error);
 }
 
 /*
  * Cancel callback for jobs waiting on the global daemon job list.  Under
  * aio_job_mtx, the job is unlinked from aio_jobs unless the cleared flag
  * says someone else already took it off; the job is then completed as
  * cancelled.
  */
 static void
 aio_cancel_daemon_job(struct kaiocb *job)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&aio_jobs, job, list);
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 /*
  * Hand a job to the AIO daemons, to be run by 'func'.  If the job has
  * already been cancelled (the cancel function cannot be installed), it is
  * completed as cancelled instead of being queued.
  */
 void
 aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (aio_set_cancel_function(job, aio_cancel_daemon_job)) {
 		job->handle_fn = func;
 		TAILQ_INSERT_TAIL(&aio_jobs, job, list);
 		aio_kick_nowait(job->userproc);
 		mtx_unlock(&aio_job_mtx);
 		return;
 	}
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 /*
  * Cancel callback for fsync jobs parked on the per-process sync queue.
  * The job is unlinked from kaio_syncqueue unless the cleared flag says it
  * was already taken off, and is then completed as cancelled.
  */
 static void
 aio_cancel_sync(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 
 	ki = job->userproc->p_aioinfo;
-	mtx_lock(&aio_job_mtx);
+	AIO_LOCK(ki);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
-	mtx_unlock(&aio_job_mtx);
+	AIO_UNLOCK(ki);
 	aio_cancel(job);
 }
 
 /*
  * Default queueing routine for a file-backed job.  Reads and writes are
  * first offered to the physio fast path.  Anything else goes to the AIO
  * daemons, provided the file is considered safe for that or unsafe AIO is
  * enabled.  An fsync job that has earlier requests outstanding on the same
  * file is parked on the sync queue instead.  Returns 0 when the job was
  * queued or completed, EOPNOTSUPP for a refused unsafe request, and EINVAL
  * for an unknown opcode.
  */
 int
 aio_queue_file(struct file *fp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *job2;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, opcode;
 	bool safe;
 
 	lj = job->lio;
 	ki = job->userproc->p_aioinfo;
 	opcode = job->uaiocb.aio_lio_opcode;
 	if (opcode == LIO_SYNC)
 		goto queueit;
 
 	if ((error = aio_qphysio(job->userproc, job)) == 0)
 		goto done;
 #if 0
 	/*
 	 * XXX: This means qphysio() failed with EFAULT.  The current
 	 * behavior is to retry the operation via fo_read/fo_write.
 	 * Wouldn't it be better to just complete the request with an
 	 * error here?
 	 */
 	if (error > 0)
 		goto done;
 #endif
 queueit:
 	/*
 	 * Only regular files and directories on local mounts (or without a
 	 * mount) are treated as safe for daemon-driven I/O.
 	 */
 	safe = false;
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vp->v_type == VREG || vp->v_type == VDIR) {
 			mp = fp->f_vnode->v_mount;
 			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
 				safe = true;
 		}
 	}
 	if (!(safe || enable_aio_unsafe)) {
 		counted_warning(&unsafe_warningcnt,
 		    "is attempting to use unsafe AIO requests");
 		return (EOPNOTSUPP);
 	}
 
 	if (opcode == LIO_SYNC) {
 		AIO_LOCK(ki);
 		/*
 		 * Count the earlier non-sync jobs on the same file and flag
 		 * them so that their completion checks the sync queue.
 		 */
 		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
 			if (job2->fd_file == job->fd_file &&
 			    job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    job2->seqno < job->seqno) {
 				job2->jobflags |= KAIOCB_CHECKSYNC;
 				job->pending++;
 			}
 		}
 		if (job->pending != 0) {
-			if (!aio_set_cancel_function(job, aio_cancel_sync)) {
+			if (!aio_set_cancel_function_locked(job,
+				aio_cancel_sync)) {
 				AIO_UNLOCK(ki);
 				aio_cancel(job);
 				return (0);
 			}
 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 		AIO_UNLOCK(ki);
 	}
 
 	switch (opcode) {
 	case LIO_READ:
 	case LIO_WRITE:
 		aio_schedule(job, aio_process_rw);
 		error = 0;
 		break;
 	case LIO_SYNC:
 		aio_schedule(job, aio_process_sync);
 		error = 0;
 		break;
 	default:
 		error = EINVAL;
 	}
 done:
 	return (error);
 }
 
 /*
  * Non-sleeping kick: wake the first idle AIO daemon if one exists;
  * otherwise, when both the global and the per-process limits still
  * allow it, enqueue the process's kaio_task on taskqueue_aiod_kick.
  * The caller must hold aio_job_mtx.
  */
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki;
 	struct aioproc *idle;
 
 	ki = userp->p_aioinfo;
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 
 	idle = TAILQ_FIRST(&aio_freeproc);
 	if (idle != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, idle, list);
 		idle->aioprocflags &= ~AIOP_FREE;
 		wakeup(idle->aioproc);
 		return;
 	}
 
 	if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
 	    ki->kaio_active_count + num_aio_resv_start <
 	    ki->kaio_maxactive_count)
 		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
 }
 
 /*
  * Wake an idle AIO daemon, or create a new one via aio_newproc() when
  * the global and per-process limits permit.  aio_job_mtx must be held
  * on entry; it is dropped around aio_newproc() and re-taken afterwards.
  * Returns 0 when a daemon was woken or created and -1 when neither was
  * possible.
  */
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki;
 	struct aioproc *idle;
 	int error;
 
 	ki = userp->p_aioinfo;
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 
 	for (;;) {
 		idle = TAILQ_FIRST(&aio_freeproc);
 		if (idle != NULL) {
 			TAILQ_REMOVE(&aio_freeproc, idle, list);
 			idle->aioprocflags &= ~AIOP_FREE;
 			wakeup(idle->aioproc);
 			return (0);
 		}
 
 		if (!(num_aio_resv_start + num_aio_procs < max_aio_procs &&
 		    ki->kaio_active_count + num_aio_resv_start <
 		    ki->kaio_maxactive_count))
 			return (-1);
 
 		/* Reserve a slot before dropping the mutex. */
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error == 0)
 			return (0);
 
 		/* Creation failed: give the slot back and re-evaluate. */
 		num_aio_resv_start--;
 	}
 }
 
 /*
  * Task handler: call aio_kick() for the process passed as 'context' up
  * to 'pending' times, stopping early as soon as aio_kick() returns
  * non-zero.
  */
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp;
 
 	userp = context;
 	mtx_lock(&aio_job_mtx);
 	for (; pending > 0; pending--) {
 		if (aio_kick(userp) != 0)
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  */
 static int
 kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	long status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 int
 sys_aio_return(struct thread *td, struct aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  *
  * 'ujoblist' is a kernel copy of the 'njoblist' user control block
  * pointers to wait on; 'ts' is an optional timeout (NULL means no
  * timeout is passed to msleep()).
  *
  * Returns EINVAL for an invalid timeout, EAGAIN when the process has no
  * AIO state, 0 when the list is empty, when a listed job has finished
  * or when none of the listed jobs is known to the process, and
  * otherwise the msleep() error (with ERESTART mapped to EINTR).
  */
 static int
 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
     struct timespec *ts)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *firstjob, *job;
 	int error, i, timo;
 
 	timo = 0;
 	if (ts) {
 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	if (njoblist == 0)
 		return (0);
 
 	AIO_LOCK(ki);
 	for (;;) {
 		firstjob = NULL;
 		error = 0;
 		/*
 		 * Rescan all of the process's jobs on every pass; leave
 		 * with error == 0 as soon as a listed job is finished.
 		 */
 		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (job->ujob == ujoblist[i]) {
 					if (firstjob == NULL)
 						firstjob = job;
 					if (job->jobflags & KAIOCB_FINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/*
 		 * None of the listed requests matched a job on kaio_all,
 		 * so there is nothing to wait for.
 		 */
 		if (firstjob == NULL)
 			break;
 
 		/* Sleep on the aioinfo channel until woken or timed out. */
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	return (error);
 }
 
 int
 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	int error;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
 	if (error == 0)
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-physio aio operations not currently in
  * progress.
  *
  * The result is reported through td_retval[0] as AIO_CANCELED,
  * AIO_NOTCANCELED or AIO_ALLDONE; the function itself returns 0 unless
  * the descriptor lookup fails.
  */
 int
 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p;
 	struct kaioinfo *ki;
 	struct kaiocb *job, *next;
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, ncancelled, nrefused;
 
 	p = td->td_proc;
 	ncancelled = 0;
 	nrefused = 0;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
 	if (error)
 		return (error);
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		goto done;
 
 	/* Requests against disk vnodes are reported as not cancelled. */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vn_isdisk(vp, &error)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, next) {
 		if (uap->fd != job->uaiocb.aio_fildes)
 			continue;
 		if (uap->aiocbp != NULL && uap->aiocbp != job->ujob)
 			continue;
 
 		if (aio_cancel_job(p, ki, job))
 			ncancelled++;
 		else
 			nrefused++;
 
 		/* A specific control block matches at most one job. */
 		if (uap->aiocbp != NULL)
 			break;
 	}
 	AIO_UNLOCK(ki);
 
 done:
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL && ncancelled != 0)
 		td->td_retval[0] = AIO_CANCELED;
 	else if (nrefused != 0)
 		td->td_retval[0] = AIO_NOTCANCELED;
 	else if (ncancelled != 0)
 		td->td_retval[0] = AIO_CANCELED;
 	else
 		td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
 
 /*
  * aio_error is implemented in the kernel level for compatibility purposes
  * only.  For a user mode async implementation, it would be best to do it in
  * a userland subroutine.
  *
  * The result is delivered through td_retval[0]; the function itself
  * always returns 0.
  */
 static int
 kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	int status;
 
 	ki = td->td_proc->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		if ((job->jobflags & KAIOCB_FINISHED) != 0)
 			td->td_retval[0] = job->uaiocb._aiocb_private.error;
 		else
 			td->td_retval[0] = EINPROGRESS;
 		AIO_UNLOCK(ki);
 		return (0);
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue.
 	 */
 	status = ops->fetch_status(ujob);
 	if (status == -1)
 		td->td_retval[0] = ops->fetch_error(ujob);
 	else
 		td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 int
 sys_aio_error(struct thread *td, struct aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 /* FreeBSD 6 compat variant: queues LIO_READ via aiocb_ops_osigevent. */
 int
 freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 /* FreeBSD 6 compat variant: queues LIO_WRITE via aiocb_ops_osigevent. */
 int
 freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
 }
 
 int
 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
 }
 
 /*
  * Back end of lio_listio: queue up to 'nent' requests as one list job.
  *
  * 'mode' is LIO_WAIT or LIO_NOWAIT.  'uacb_list' is the user address of
  * the request array (only used as the kevent identifier), 'acb_list' is
  * the kernel copy of the user control block pointers (NULL entries are
  * skipped), 'sig' is the optional completion notification (only
  * honoured for LIO_NOWAIT) and 'ops' selects the ABI accessors.
  *
  * Returns EINVAL for a bad mode, count or notification, the
  * kqfd_register() error if kevent registration fails, EIO if any
  * request could not be queued, and otherwise 0 or the error from an
  * interrupted LIO_WAIT sleep.
  */
 static int
 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
     struct aiocb **acb_list, int nent, struct sigevent *sig,
     struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *job;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int error;
 	int nerror;
 	int i;
 
 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
 		return (EINVAL);
 
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (sig && (mode == LIO_NOWAIT)) {
 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/*
 			 * Assume only new style KEVENT.  Zero the whole
 			 * structure first so that fields not set below
 			 * (e.g. fflags) are not passed uninitialized.
 			 */
 			bzero(&kev, sizeof(kev));
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uacb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 			if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 				uma_zfree(aiolio_zone, lj);
 				return (EINVAL);
 			}
 			lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			uma_zfree(aiolio_zone, lj);
 			return (EINVAL);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nerror = 0;
 	for (i = 0; i < nent; i++) {
 		job = acb_list[i];
 		if (job != NULL) {
 			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
 			if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (mode == LIO_WAIT) {
 		/* The "- 1" discounts the extra reference taken above. */
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		/*
 		 * Everything already finished while we were queueing:
 		 * deliver the completion notification from here.
 		 */
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 			    == LIOJ_SIGNAL
 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal,
 					    &lj->lioj_ksi);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	/* Drop the extra reference; free the list job if it was the last. */
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	if (nerror)
 		return (EIO);
 	return (error);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 /*
  * FreeBSD 6 compat lio_listio: accepts the old sigevent layout, which
  * is converted with convert_old_sigevent() before the request list is
  * copied in and passed to kern_lio_listio().
  */
 int
 freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent osig;
 	int error, nent;
 
 	if (uap->mode != LIO_NOWAIT && uap->mode != LIO_WAIT)
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	sigp = NULL;
 	if (uap->sig && uap->mode == LIO_NOWAIT) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	}
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0) {
 		error = kern_lio_listio(td, uap->mode,
 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 		    &aiocb_ops_osigevent);
 	}
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 /* syscall - list directed I/O (REALTIME) */
 /*
  * Validates the mode and entry count, copies in the optional sigevent
  * (LIO_NOWAIT only) and the request pointer list, then defers to
  * kern_lio_listio() with the native accessors.
  */
 int
 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	int error, nent;
 
 	if (uap->mode != LIO_NOWAIT && uap->mode != LIO_WAIT)
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	sigp = NULL;
 	if (uap->sig && uap->mode == LIO_NOWAIT) {
 		error = copyin(uap->sig, &sig, sizeof(sig));
 		if (error)
 			return (error);
 		sigp = &sig;
 	}
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0) {
 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
 		    nent, sigp, &aiocb_ops);
 	}
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 /*
  * Completion handler for a bio whose bio_caller1 is the owning job:
  * release the kernel mapping and held pages, account the transferred
  * blocks, complete the job and destroy the bio.
  */
 static void
 aio_physwakeup(struct bio *bp)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	size_t nbytes;
 	int error, nblks;
 
 	job = (struct kaiocb *)bp->bio_caller1;
 	ki = job->userproc->p_aioinfo;
 
 	/* Release mapping into kernel space. */
 	if (job->pbuf) {
 		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
 		relpbuf(job->pbuf, NULL);
 		job->pbuf = NULL;
 		atomic_subtract_int(&num_buf_aio, 1);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 	}
 	vm_page_unhold_pages(job->pages, job->npages);
 
 	/* From here on work with the bio recorded in the job. */
 	bp = job->bp;
 	job->bp = NULL;
 
 	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
 	if (bp->bio_flags & BIO_ERROR)
 		error = bp->bio_error;
 	else
 		error = 0;
 
 	nblks = btodb(nbytes);
 	if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
 		job->outblock += nblks;
 	else
 		job->inblock += nblks;
 
 	if (error == 0)
 		aio_complete(job, nbytes, 0);
 	else
 		aio_complete(job, -1, error);
 
 	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */
 /*
  * 'ujobp' receives the user address of the completed control block
  * (NULL is stored first).  A NULL 'ts' passes no timeout to msleep(), a
  * zero 'ts' polls, and any other value is a timeout.  On completion the
  * job's status goes to td_retval[0] and its error is returned;
  * otherwise the wait error is returned (EWOULDBLOCK when polling).
  */
 static int
 kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
     struct timespec *ts, struct aiocb_ops *ops)
 {
 	struct proc *p;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	struct aiocb *ujob;
 	long error, status;
 	int timo;
 
 	p = td->td_proc;
 	ops->store_aiocb(ujobp, NULL);
 
 	if (ts == NULL) {
 		timo = 0;
 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
 		timo = -1;
 	} else {
 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	AIO_LOCK(ki);
 	for (;;) {
 		job = TAILQ_FIRST(&ki->kaio_done);
 		if (job != NULL)
 			break;
 		if (timo == -1) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (job == NULL) {
 		AIO_UNLOCK(ki);
 		return (error);
 	}
 
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 	ujob = job->ujob;
 	status = job->uaiocb._aiocb_private.status;
 	error = job->uaiocb._aiocb_private.error;
 	td->td_retval[0] = status;
 	td->td_ru.ru_oublock += job->outblock;
 	td->td_ru.ru_inblock += job->inblock;
 	td->td_ru.ru_msgsnd += job->msgsnd;
 	td->td_ru.ru_msgrcv += job->msgrcv;
 	aio_free_entry(job);
 	AIO_UNLOCK(ki);
 
 	/* User-space stores happen after the lock is released. */
 	ops->store_aiocb(ujobp, ujob);
 	ops->store_error(ujob, error);
 	ops->store_status(ujob, status);
 	return (error);
 }
 
 /*
  * Native aio_waitcomplete entry point: copy in the optional timeout and
  * defer to kern_aio_waitcomplete().
  */
 int
 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct timespec ts, *tsp;
 	int error;
 
 	tsp = NULL;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	}
 
 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
 }
 
 /*
  * Back end of aio_fsync: only O_SYNC is accepted (EINVAL otherwise).
  * The process's AIO state is created on demand and the request is
  * queued as LIO_SYNC.
  */
 static int
 kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
     struct aiocb_ops *ops)
 {
 	struct proc *p;
 
 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
 		return (EINVAL);
 
 	p = td->td_proc;
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
 }
 
 int
 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 
 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct kaiocb *job;
 
 	/*
 	 * The job pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 
 	/* The job arrives in kn_sdata; remember it and drop the marker. */
 	job = (struct kaiocb *)kn->kn_sdata;
 	kn->kn_ptr.p_aio = job;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&job->klist, kn, 0);
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct knlist *list;
 
 	list = &kn->kn_ptr.p_aio->klist;
 
 	/* Remove the knote under the list's own lock, if still populated. */
 	list->kl_lock(list->kl_lockarg);
 	if (!knlist_empty(list))
 		knlist_remove(list, kn, 1);
 	list->kl_unlock(list->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct kaiocb *job;
 
 	job = kn->kn_ptr.p_aio;
 
 	/* The job's error value is always published in kn_data. */
 	kn->kn_data = job->uaiocb._aiocb_private.error;
 	if ((job->jobflags & KAIOCB_FINISHED) != 0) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 	return (0);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob *lj;
 
 	/*
 	 * The aioliojob pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 
 	/* The list job arrives in kn_sdata; remember it and drop the marker. */
 	lj = (struct aioliojob *)kn->kn_sdata;
 	kn->kn_ptr.p_lio = lj;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&lj->klist, kn, 0);
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct knlist *list;
 
 	list = &kn->kn_ptr.p_lio->klist;
 
 	/* Remove the knote under the list's own lock, if still populated. */
 	list->kl_lock(list->kl_lockarg);
 	if (!knlist_empty(list))
 		knlist_remove(list, kn, 1);
 	list->kl_unlock(list->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob *lj;
 
 	/* Non-zero once LIOJ_KEVENT_POSTED has been set on the list job. */
 	lj = kn->kn_ptr.p_lio;
 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
 }
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/socket.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 /*
  * 32-bit image of the private part of a user control block.  The
  * status and error words are accessed with fuword32()/suword32() by
  * the aiocb32_* accessors; kernelinfo is a 32-bit field that
  * aiocb32_copyin() converts with PTRIN_CP().
  */
 struct __aiocb_private32 {
 	int32_t	status;
 	int32_t	error;
 	uint32_t kernelinfo;
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * 32-bit control block layout with the old-style sigevent, as copied in
  * by aiocb32_copyin_old_sigevent().
  * NOTE(review): __packed on aio_offset presumably keeps the 64-bit
  * field at 4-byte alignment to match the 32-bit userland layout --
  * confirm.
  */
 typedef struct oaiocb32 {
 	int	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 } oaiocb32_t;
 #endif
 
 /*
  * 32-bit control block layout as copied in by aiocb32_copyin() and
  * accessed field-by-field by the aiocb32_* fetch/store helpers.
  * NOTE(review): __packed on aio_offset presumably keeps the 64-bit
  * field at 4-byte alignment to match the 32-bit userland layout --
  * confirm.
  */
 typedef struct aiocb32 {
 	int32_t	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	int	__spare__[2];
 	uint32_t __spare2__;
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
 } aiocb32_t;
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Convert a 32-bit old-style sigevent into a native struct sigevent.
  * Returns 0 on success or EINVAL for an unsupported notification type.
  */
 static int
 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
 {
 
 	/*
 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 	 * supported by AIO with the old sigevent structure.
 	 */
 	CP(*osig, *nsig, sigev_notify);
 	if (nsig->sigev_notify == SIGEV_NONE)
 		return (0);
 	if (nsig->sigev_notify == SIGEV_SIGNAL) {
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		return (0);
 	}
 	if (nsig->sigev_notify == SIGEV_KEVENT) {
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 static int
 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb32 job32;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_old_sigevent32(&job32.aio_sigevent,
 	    &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct aiocb32 job32;
 	int error;
 
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
 }
 
 /* Read the 32-bit status word from a user control block. */
 static long
 aiocb32_fetch_status(struct aiocb *ujob)
 {
 	struct aiocb32 *cb32 = (struct aiocb32 *)ujob;
 
 	return (fuword32(&cb32->_aiocb_private.status));
 }
 
 static long
 aiocb32_fetch_error(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.error));
 }
 
 /* Store 'status' into the 32-bit status word of a user control block. */
 static int
 aiocb32_store_status(struct aiocb *ujob, long status)
 {
 	struct aiocb32 *cb32 = (struct aiocb32 *)ujob;
 
 	return (suword32(&cb32->_aiocb_private.status, status));
 }
 
 static int
 aiocb32_store_error(struct aiocb *ujob, long error)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.error, error));
 }
 
 /* Store 'jobref' into the 32-bit kernelinfo word of a user control block. */
 static int
 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	struct aiocb32 *cb32 = (struct aiocb32 *)ujob;
 
 	return (suword32(&cb32->_aiocb_private.kernelinfo, jobref));
 }
 
 /* Store the control block address 'ujob' as a 32-bit word at 'ujobp'. */
 static int
 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 	long addr;
 
 	addr = (long)ujob;
 	return (suword32(ujobp, addr));
 }
 
 /*
  * Accessor table for 32-bit control blocks with the current sigevent
  * layout; passed to the kern_aio_*() and aio_aqueue() back ends by the
  * freebsd32_* entry points.
  */
 static struct aiocb_ops aiocb32_ops = {
 	.copyin = aiocb32_copyin,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Same as aiocb32_ops except that control blocks are copied in with
  * aiocb32_copyin_old_sigevent(), i.e. using the old sigevent layout.
  */
 static struct aiocb_ops aiocb32_ops_osigevent = {
 	.copyin = aiocb32_copyin_old_sigevent,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 #endif
 
 /* 32-bit aio_return entry point; uses the aiocb32_ops accessors. */
 int
 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_return(td, ujob, &aiocb32_ops));
 }
 
 /*
  * 32-bit aio_suspend entry point.
  *
  * Validates the entry count, copies in the optional 32-bit timeout and
  * the list of 32-bit control block pointers, widens the pointers in
  * place and defers to kern_aio_suspend().  Returns EINVAL for a bad
  * count, a copyin() error, or the kern_aio_suspend() result.
  */
 int
 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	uint32_t *ujoblist32;
 	int error, i;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	/* The 32-bit pointers are copied into the start of the same buffer. */
 	ujoblist32 = (uint32_t *)ujoblist;
 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
 	    sizeof(ujoblist32[0]));
 	if (error == 0) {
 		/*
 		 * Widen entries nent-1 .. 0 in place.  Walking from the
 		 * last valid index down to 0 means a widened store never
 		 * overwrites a 32-bit entry that has not been read yet
 		 * (assuming native pointers are at least 32 bits wide),
 		 * and stays within the nent entries that were copied in.
 		 */
 		for (i = uap->nent - 1; i >= 0; i--)
 			ujoblist[i] = PTRIN(ujoblist32[i]);
 
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	}
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 /* 32-bit aio_error entry point; uses the aiocb32_ops accessors. */
 int
 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_error(td, ujob, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 /* 32-bit FreeBSD 6 compat aio_read: LIO_READ via aiocb32_ops_osigevent. */
 int
 freebsd6_freebsd32_aio_read(struct thread *td,
     struct freebsd6_freebsd32_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops_osigevent));
 }
 #endif
 
 /* 32-bit aio_read entry point: queue an LIO_READ request. */
 int
 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 /* 32-bit FreeBSD 6 compat aio_write: LIO_WRITE via aiocb32_ops_osigevent. */
 int
 freebsd6_freebsd32_aio_write(struct thread *td,
     struct freebsd6_freebsd32_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops_osigevent));
 }
 #endif
 
 /* 32-bit aio_write entry point: queue an LIO_WRITE request. */
 int
 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops));
 }
 
 /* 32-bit aio_mlock entry point: queue an LIO_MLOCK request. */
 int
 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_MLOCK, &aiocb32_ops));
 }
 
 /*
  * 32-bit aio_waitcomplete entry point: copy in and widen the optional
  * 32-bit timeout, then defer to kern_aio_waitcomplete().
  */
 int
 freebsd32_aio_waitcomplete(struct thread *td,
     struct freebsd32_aio_waitcomplete_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	tsp = NULL;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	}
 
 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
 	    &aiocb32_ops));
 }
 
 /* 32-bit aio_fsync entry point; uses the aiocb32_ops accessors. */
 int
 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_fsync(td, uap->op, ujob, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 /*
  * 32-bit FreeBSD 6 compat lio_listio: converts the old 32-bit sigevent,
  * copies in the array of 32-bit control block pointers, widens it into
  * a native pointer array and defers to kern_lio_listio().
  */
 int
 freebsd6_freebsd32_lio_listio(struct thread *td,
     struct freebsd6_freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent32 osig;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if (uap->mode != LIO_NOWAIT && uap->mode != LIO_WAIT)
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	sigp = NULL;
 	if (uap->sig && uap->mode == LIO_NOWAIT) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent32(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	}
 
 	/* Fetch the 32-bit pointers, then widen them into a second array. */
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 /*
  * 32-bit lio_listio entry point: converts the 32-bit sigevent, copies
  * in the array of 32-bit control block pointers, widens it into a
  * native pointer array and defers to kern_lio_listio().
  */
 int
 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct sigevent32 sig32;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if (uap->mode != LIO_NOWAIT && uap->mode != LIO_WAIT)
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	sigp = NULL;
 	if (uap->sig && uap->mode == LIO_NOWAIT) {
 		error = copyin(uap->sig, &sig32, sizeof(sig32));
 		if (error)
 			return (error);
 		error = convert_sigevent32(&sig32, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	}
 
 	/* Fetch the 32-bit pointers, then widen them into a second array. */
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 #endif
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig	(nonexistent)
@@ -1,28 +0,0 @@
-config INFINIBAND_SDP
-	tristate "Sockets Direct Protocol"
-	depends on INFINIBAND && INFINIBAND_IPOIB
-	---help---
-	  Support for Sockets Direct Protocol (SDP).  This provides
-          sockets semantics over InfiniBand via address family
-          AF_INET_SDP (address family 27).  You can also LD_PRELOAD the
-          libsdp library from <http://openib.org> to have standard
-          sockets applications use SDP.
-
-config INFINIBAND_SDP_DEBUG
-	bool "Sockets Direct Protocol debugging"
-	depends on INFINIBAND_SDP
-	---help---
-	  This option causes debugging code to be compiled into the
-	  SDP driver.  The output can be turned on via the debug_level
-	  module parameter  (which can also be set through sysfs after the
-	  driver is loaded).
-
-config INFINIBAND_SDP_DEBUG_DATA
-        bool "Sockets Direct Protocol data path debugging"
-        depends on INFINIBAND_SDP_DEBUG
-        ---help---
-          This option compiles debugging code into the data path
-          of the SDP driver.  The output can be turned on via the
-          data_debug_level module parameter; however, even with output
-          turned off, this debugging code will have some performance
-          impact.

Property changes on: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
___________________________________________________________________
Deleted: fbsd:nokeywords
## -1 +0,0 ##
-true
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Makefile	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Makefile	(nonexistent)
@@ -1,6 +0,0 @@
-EXTRA_CFLAGS += -Idrivers/infiniband/include
-EXTRA_CFLAGS += -ggdb
-
-obj-$(CONFIG_INFINIBAND_SDP) += ib_sdp.o
-
-ib_sdp-objs := sdp_main.o sdp_cma.o sdp_bcopy.o sdp_proc.o sdp_tx.o sdp_rx.o sdp_zcopy.o

Property changes on: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
___________________________________________________________________
Deleted: fbsd:nokeywords
## -1 +0,0 ##
-true
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h	(revision 303517)
@@ -1,725 +1,726 @@
 #ifndef _SDP_H_
 #define _SDP_H_
 
 #define	LINUXKPI_PARAM_PREFIX ib_sdp_
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ofed.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/domain.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_cm.h>
 #include <rdma/sdp_socket.h>
 #include <rdma/ib_fmr_pool.h>
 
 #ifdef SDP_DEBUG
 #define	CONFIG_INFINIBAND_SDP_DEBUG
 #endif
 
 #include "sdp_dbg.h"
 
 #undef LIST_HEAD
 /* From sys/queue.h */
 #define LIST_HEAD(name, type)                                           \
 struct name {                                                           \
         struct type *lh_first;  /* first element */                     \
 }
 
 /* Interval between successive polls in the Tx routine when polling is used
    instead of interrupts (in per-core Tx rings) - should be power of 2 */
 #define SDP_TX_POLL_MODER	16
 #define SDP_TX_POLL_TIMEOUT	(HZ / 20)
 #define SDP_NAGLE_TIMEOUT (HZ / 10)
 
 #define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5)
 #define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ)
 #define SDP_SRCAVAIL_PAYLOAD_LEN 1
 
 #define SDP_RESOLVE_TIMEOUT 1000
 #define SDP_ROUTE_TIMEOUT 1000
 #define SDP_RETRY_COUNT 5
 #define SDP_KEEPALIVE_TIME (120 * 60 * HZ)
 #define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */
 
 #define SDP_TX_SIZE 0x40
 #define SDP_RX_SIZE 0x40
 
 #define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64))
 #define SDP_FMR_POOL_SIZE	1024
 #define SDP_FMR_DIRTY_SIZE	( SDP_FMR_POOL_SIZE / 4 )
 
 #define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2))
 
 /* mb inlined data len - rest will be rx'ed into frags */
 #define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh))
 
 /* limit tx payload len, if the sink supports bigger buffers than the source
  * can handle.
  * or rx fragment size (limited by sge->length size) */
 #define	SDP_MAX_PACKET	(1 << 16)
 #define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE)
 
 #define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES)
 /* Whole expansion is parenthesized so "+ 2" cannot bind to a neighbouring operator. */
 #define SDP_MAX_SEND_SGES ((SDP_MAX_PACKET / MCLBYTES) + 2)
 
 #define SDP_NUM_WC 4
 
 /* Default is a parenthesized product so it is safe as an operand of / or %. */
 #define SDP_DEF_ZCOPY_THRESH (64 * 1024)
 #define SDP_MIN_ZCOPY_THRESH PAGE_SIZE
 #define SDP_MAX_ZCOPY_THRESH 1048576
 
 #define SDP_OP_RECV 0x800000000LL
 #define SDP_OP_SEND 0x400000000LL
 #define SDP_OP_RDMA 0x200000000LL
 #define SDP_OP_NOP  0x100000000LL
 
 /* how long (in jiffies) to block sender till tx completion*/
 #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
 
 #define SDP_AUTO_CONF	0xffff
 #define AUTO_MOD_DELAY (HZ / 4)
 
 struct sdp_mb_cb {
 	__u32		seq;		/* Starting sequence number	*/
 	struct bzcopy_state      *bz;
 	struct rx_srcavail_state *rx_sa;
 	struct tx_srcavail_state *tx_sa;
 };
 
 #define	M_PUSH	M_PROTO1	/* Do a 'push'. */
 #define	M_URG	M_PROTO2	/* Mark as urgent (oob). */
 
 #define SDP_SKB_CB(__mb)      ((struct sdp_mb_cb *)&((__mb)->cb[0]))
 #define BZCOPY_STATE(mb)      (SDP_SKB_CB(mb)->bz)
 #define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa)
 #define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa)
 
 #ifndef MIN
 /*
  * Fallback used only when no MIN is already defined.  Arguments and the
  * whole expansion are parenthesized; each argument may still be evaluated
  * twice, so avoid operands with side effects.
  */
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif
 
 #define ring_head(ring)   (atomic_read(&(ring).head))
 #define ring_tail(ring)   (atomic_read(&(ring).tail))
 #define ring_posted(ring) (ring_head(ring) - ring_tail(ring))
 
 #define rx_ring_posted(ssk) ring_posted(ssk->rx_ring)
 #ifdef SDP_ZCOPY
 #define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \
 	(ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0))
 #else
 #define tx_ring_posted(ssk) ring_posted(ssk->tx_ring)
 #endif
 
 extern int sdp_zcopy_thresh;
 extern int rcvbuf_initial_size;
 extern struct workqueue_struct *rx_comp_wq;
 extern struct ib_client sdp_client;
 
 enum sdp_mid {
 	SDP_MID_HELLO = 0x0,
 	SDP_MID_HELLO_ACK = 0x1,
 	SDP_MID_DISCONN = 0x2,
 	SDP_MID_ABORT = 0x3,
 	SDP_MID_SENDSM = 0x4,
 	SDP_MID_RDMARDCOMPL = 0x6,
 	SDP_MID_SRCAVAIL_CANCEL = 0x8,
 	SDP_MID_CHRCVBUF = 0xB,
 	SDP_MID_CHRCVBUF_ACK = 0xC,
 	SDP_MID_SINKAVAIL = 0xFD,
 	SDP_MID_SRCAVAIL = 0xFE,
 	SDP_MID_DATA = 0xFF,
 };
 
 enum sdp_flags {
         SDP_OOB_PRES = 1 << 0,
         SDP_OOB_PEND = 1 << 1,
 };
 
 enum {
 	SDP_MIN_TX_CREDITS = 2
 };
 
 enum {
 	SDP_ERR_ERROR   = -4,
 	SDP_ERR_FAULT   = -3,
 	SDP_NEW_SEG     = -2,
 	SDP_DO_WAIT_MEM = -1
 };
 
 struct sdp_bsdh {
 	u8 mid;
 	u8 flags;
 	__u16 bufs;
 	__u32 len;
 	__u32 mseq;
 	__u32 mseq_ack;
 } __attribute__((__packed__));
 
 union cma_ip_addr {
 	struct in6_addr ip6;
 	struct {
 		__u32 pad[3];
 		__u32 addr;
 	} ip4;
 } __attribute__((__packed__));
 
 /* TODO: too much? Can I avoid having the src/dst and port here? */
 struct sdp_hh {
 	struct sdp_bsdh bsdh;
 	u8 majv_minv;
 	u8 ipv_cap;
 	u8 rsvd1;
 	u8 max_adverts;
 	__u32 desremrcvsz;
 	__u32 localrcvsz;
 	__u16 port;
 	__u16 rsvd2;
 	union cma_ip_addr src_addr;
 	union cma_ip_addr dst_addr;
 	u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48];
 } __attribute__((__packed__));
 
 struct sdp_hah {
 	struct sdp_bsdh bsdh;
 	u8 majv_minv;
 	u8 ipv_cap;
 	u8 rsvd1;
 	u8 ext_max_adverts;
 	__u32 actrcvsz;
 	u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8];
 } __attribute__((__packed__));
 
 struct sdp_rrch {
 	__u32 len;
 } __attribute__((__packed__));
 
 struct sdp_srcah {
 	__u32 len;
 	__u32 rkey;
 	__u64 vaddr;
 } __attribute__((__packed__));
 
 struct sdp_buf {
         struct mbuf *mb;
         u64             mapping[SDP_MAX_SEND_SGES];
 } __attribute__((__packed__));
 
 struct sdp_chrecvbuf {
 	u32 size;
 } __attribute__((__packed__));
 
 /* Context used for synchronous zero copy bcopy (BZCOPY) */
 struct bzcopy_state {
 	unsigned char __user  *u_base;
 	int                    u_len;
 	int                    left;
 	int                    page_cnt;
 	int                    cur_page;
 	int                    cur_offset;
 	int                    busy;
 	struct sdp_sock      *ssk;
 	struct page         **pages;
 };
 
 enum rx_sa_flag {
 	RX_SA_ABORTED    = 2,
 };
 
 enum tx_sa_flag {
 	TX_SA_SENDSM     = 0x01,
 	TX_SA_CROSS_SEND = 0x02,
 	TX_SA_INTRRUPTED = 0x04,
 	TX_SA_TIMEDOUT   = 0x08,
 	TX_SA_ERROR      = 0x10,
 };
 
 struct rx_srcavail_state {
 	/* Advertised buffer stuff */
 	u32 mseq;
 	u32 used;
 	u32 reported;
 	u32 len;
 	u32 rkey;
 	u64 vaddr;
 
 	/* Dest buff info */
 	struct ib_umem *umem;
 	struct ib_pool_fmr *fmr;
 
 	/* Utility */
 	u8  busy;
 	enum rx_sa_flag  flags;
 };
 
 struct tx_srcavail_state {
 	/* Data below 'busy' will be reset */
 	u8		busy;
 
 	struct ib_umem *umem;
 	struct ib_pool_fmr *fmr;
 
 	u32		bytes_sent;
 	u32		bytes_acked;
 
 	enum tx_sa_flag	abort_flags;
 	u8		posted;
 
 	u32		mseq;
 };
 
 struct sdp_tx_ring {
 #ifdef SDP_ZCOPY
 	struct rx_srcavail_state *rdma_inflight;
 #endif
 	struct sdp_buf   	*buffer;
 	atomic_t          	head;
 	atomic_t          	tail;
 	struct ib_cq 	 	*cq;
 
 	atomic_t 	  	credits;
 #define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits))
 
 	struct callout		timer;
 	u16 		  	poll_cnt;
 };
 
 struct sdp_rx_ring {
 	struct sdp_buf   *buffer;
 	atomic_t          head;
 	atomic_t          tail;
 	struct ib_cq 	 *cq;
 
 	int		 destroyed;
 	struct rwlock	 destroyed_lock;
 };
 
 struct sdp_device {
 	struct ib_pd 		*pd;
 	struct ib_mr 		*mr;
 	struct ib_fmr_pool 	*fmr_pool;
 };
 
 struct sdp_moderation {
 	unsigned long last_moder_packets;
 	unsigned long last_moder_tx_packets;
 	unsigned long last_moder_bytes;
 	unsigned long last_moder_jiffies;
 	int last_moder_time;
 	u16 rx_usecs;
 	u16 rx_frames;
 	u16 tx_usecs;
 	u32 pkt_rate_low;
 	u16 rx_usecs_low;
 	u32 pkt_rate_high;
 	u16 rx_usecs_high;
 	u16 sample_interval;
 	u16 adaptive_rx_coal;
 	u32 msg_enable;
 
 	int moder_cnt;
 	int moder_time;
 };
 
 /* These are flags fields. */
 #define	SDP_TIMEWAIT	0x0001		/* In ssk timewait state. */
 #define	SDP_DROPPED	0x0002		/* Socket has been dropped. */
 #define	SDP_SOCKREF	0x0004		/* Holding a sockref for close. */
 #define	SDP_NODELAY	0x0008		/* Disable Nagle. */
 #define	SDP_NEEDFIN	0x0010		/* Send a fin on the next tx. */
 #define	SDP_DREQWAIT	0x0020		/* Waiting on DREQ. */
 #define	SDP_DESTROY	0x0040		/* Being destroyed. */
 #define	SDP_DISCON	0x0080		/* rdma_disconnect is owed. */
 
 /* These are oobflags */
 #define	SDP_HADOOB	0x0001		/* Had OOB data. */
 #define	SDP_HAVEOOB	0x0002		/* Have OOB data. */
 
 struct sdp_sock {
 	LIST_ENTRY(sdp_sock) list;
 	struct socket *socket;
 	struct rdma_cm_id *id;
 	struct ib_device *ib_device;
 	struct sdp_device *sdp_dev;
 	struct ib_qp *qp;
 	struct ucred *cred;
 	struct callout keep2msl;	/* 2msl and keepalive timer. */
 	struct callout nagle_timer;	/* timeout waiting for ack */
 	struct ib_ucontext context;
 	in_port_t lport;
 	in_addr_t laddr;
 	in_port_t fport;
 	in_addr_t faddr;
 	int flags;
 	int oobflags;		/* protected by rx lock. */
 	int state;
 	int softerror;
 	int recv_bytes;		/* Bytes per recv. buf including header */
 	int xmit_size_goal;
 	char iobc;
 
 	struct sdp_rx_ring rx_ring;
 	struct sdp_tx_ring tx_ring;
 	struct rwlock	lock;
-	struct mbuf *rx_ctl_q;
-	struct mbuf *rx_ctl_tail;
+	struct mbufq	rxctlq;		/* received control packets */
 
 	int qp_active;	/* XXX Flag. */
 	int max_sge;
 	struct work_struct rx_comp_work;
 #define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt))
 	atomic_t rcv_nxt;
 
 	/* SDP specific */
 	atomic_t mseq_ack;
 #define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack))
 	unsigned max_bufs;	/* Initial buffers offered by other side */
 	unsigned min_bufs;	/* Low water mark to wake senders */
 
 	unsigned long nagle_last_unacked; /* mseq of lastest unacked packet */
 
 	atomic_t               remote_credits;
 #define remote_credits(ssk) (atomic_read(&ssk->remote_credits))
 	int 		  poll_cq;
 
 	/* SDP slow start */
 	int recv_request_head; 	/* mark the rx_head when the resize request
 				   was received */
 	int recv_request; 	/* XXX flag if request to resize was received */
 
 	unsigned long tx_packets;
 	unsigned long rx_packets;
 	unsigned long tx_bytes;
 	unsigned long rx_bytes;
 	struct sdp_moderation auto_mod;
 	struct task shutdown_task;
 #ifdef SDP_ZCOPY
 	struct tx_srcavail_state *tx_sa;
 	struct rx_srcavail_state *rx_sa;
 	spinlock_t tx_sa_lock;
 	struct delayed_work srcavail_cancel_work;
 	int srcavail_cancel_mseq;
 	/* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */
 	int zcopy_thresh;
 #endif
 };
 
 #define	sdp_sk(so)	((struct sdp_sock *)(so->so_pcb))
 
 #define	SDP_RLOCK(ssk)		rw_rlock(&(ssk)->lock)
 #define	SDP_WLOCK(ssk)		rw_wlock(&(ssk)->lock)
 #define	SDP_RUNLOCK(ssk)	rw_runlock(&(ssk)->lock)
 #define	SDP_WUNLOCK(ssk)	rw_wunlock(&(ssk)->lock)
 #define	SDP_WLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_WLOCKED)
 #define	SDP_RLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_RLOCKED)
 #define	SDP_LOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_LOCKED)
+
+MALLOC_DECLARE(M_SDP);
 
 static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa)
 {
 	memset((void *)&tx_sa->busy, 0,
 			sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy));
 }
 
 static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring)
 {
 	rw_runlock(&rx_ring->destroyed_lock);
 }
 
 /*
  * Take the ring's destroyed_lock for reading.  Returns 1 with the lock
  * held when the ring is still usable; returns 0 with the lock released
  * when the ring has been marked destroyed.
  */
 static inline int rx_ring_trylock(struct sdp_rx_ring *rx_ring)
 {
 	rw_rlock(&rx_ring->destroyed_lock);
 	if (!rx_ring->destroyed)
 		return 1;
 
 	rx_ring_unlock(rx_ring);
 	return 0;
 }
 
 static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring)
 {
 	rw_wlock(&rx_ring->destroyed_lock);
 	rx_ring->destroyed = 1;
 	rw_wunlock(&rx_ring->destroyed_lock);
 }
 
 static inline void sdp_arm_rx_cq(struct sdp_sock *ssk)
 {
 	sdp_prf(ssk->socket, NULL, "Arming RX cq");
 	sdp_dbg_data(ssk->socket, "Arming RX cq\n");
 
 	ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP);
 }
 
 static inline void sdp_arm_tx_cq(struct sdp_sock *ssk)
 {
 	sdp_prf(ssk->socket, NULL, "Arming TX cq");
 	sdp_dbg_data(ssk->socket, "Arming TX cq. credits: %d, posted: %d\n",
 		tx_credits(ssk), tx_ring_posted(ssk));
 
 	ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP);
 }
 
 /*
  * Return the number of tx slots available to the caller: the smaller of
  *  - tx credits
  *  - free slots in tx_ring
  * minus SDP_MIN_TX_CREDITS.  Returns 0 when that minimum is below
  * SDP_MIN_TX_CREDITS.
  *
  * Each input is sampled once into a local so the comparison and the value
  * returned are based on the same reads.
  */
 static inline int tx_slots_free(struct sdp_sock *ssk)
 {
 	int credits, ring_free, min_free;
 
 	credits = tx_credits(ssk);
 	ring_free = SDP_TX_SIZE - tx_ring_posted(ssk);
 	min_free = credits < ring_free ? credits : ring_free;
 	if (min_free < SDP_MIN_TX_CREDITS)
 		return 0;
 
 	return min_free - SDP_MIN_TX_CREDITS;
 }
 
 /* utilities */
 /*
  * Translate an SDP message id into the name of its enumerator.
  * Returns NULL for ids outside the table; ids inside the table that have
  * no entry return the table's unset (NULL) slot.
  */
 static inline char *mid2str(int mid)
 {
 #define ENUM2STR(e) [e] = #e
 	static char *names[] = {
 		ENUM2STR(SDP_MID_HELLO),
 		ENUM2STR(SDP_MID_HELLO_ACK),
 		ENUM2STR(SDP_MID_DISCONN),
 		ENUM2STR(SDP_MID_ABORT),
 		ENUM2STR(SDP_MID_SENDSM),
 		ENUM2STR(SDP_MID_RDMARDCOMPL),
 		ENUM2STR(SDP_MID_SRCAVAIL_CANCEL),
 		ENUM2STR(SDP_MID_CHRCVBUF),
 		ENUM2STR(SDP_MID_CHRCVBUF_ACK),
 		ENUM2STR(SDP_MID_SINKAVAIL),
 		ENUM2STR(SDP_MID_SRCAVAIL),
 		ENUM2STR(SDP_MID_DATA),
 	};
 
 	/* Negative ids are rejected explicitly rather than via unsigned wrap. */
 	if (mid < 0 || mid >= (int)ARRAY_SIZE(names))
 		return NULL;
 
 	return names[mid];
 }
 
 /*
  * Get a packet-header mbuf, set its length to that of a bare SDP header
  * (struct sdp_bsdh) and store the message id in that header.
  * Returns NULL when MGETHDR yields no mbuf.  The 'sk' and 'size'
  * arguments are not referenced here.
  */
 static inline struct mbuf *
 sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait)
 {
 	struct mbuf *m;
 	struct sdp_bsdh *bsdh;
 
 	MGETHDR(m, wait, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	m->m_len = sizeof(struct sdp_bsdh);
 	m->m_pkthdr.len = m->m_len;
 	bsdh = mtod(m, struct sdp_bsdh *);
 	bsdh->mid = mid;
 
 	return (m);
 }
 static inline struct mbuf *
 sdp_alloc_mb_data(struct socket *sk, int wait)
 {
 	return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait);
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_disconnect(struct socket *sk, int wait)
 {
 	return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait);
 }
 
 static inline void *
 mb_put(struct mbuf *mb, int len)
 {
 	uint8_t *data;
 
 	data = mb->m_data;
 	data += mb->m_len;
 	mb->m_len += len;
 	return (void *)data;
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait)
 {
 	struct mbuf *mb;
 	struct sdp_chrecvbuf *resp_size;
 
 	mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait);
 	if (mb == NULL)
 		return (NULL);
 	resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size);
 	resp_size->size = htonl(size);
 
 	return mb;
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait)
 {
 	struct mbuf *mb;
 	struct sdp_srcah *srcah;
 
 	mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait);
 	if (mb == NULL)
 		return (NULL);
 	srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah));
 	srcah->len = htonl(len);
 	srcah->rkey = htonl(rkey);
 	srcah->vaddr = cpu_to_be64(vaddr);
 
 	return mb;
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait)
 {
 	return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait);
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait)
 {
 	struct mbuf *mb;
 	struct sdp_rrch *rrch;
 
 	mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait);
 	if (mb == NULL)
 		return (NULL);
 	rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch));
 	rrch->len = htonl(len);
 
 	return mb;
 }
 
 static inline struct mbuf *
 sdp_alloc_mb_sendsm(struct socket *sk, int wait)
 {
 	return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait);
 }
 static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk)
 {
 	return SDP_TX_SIZE - tx_ring_posted(ssk);
 }
 
 /*
  * Return non-zero when all of the following hold: the remote credit count
  * (scaled by 1.5 once it exceeds SDP_MIN_TX_CREDITS) is below the number
  * of posted rx buffers, tx credits are positive, and the tx ring has a
  * slot left.
  */
 static inline int credit_update_needed(struct sdp_sock *ssk)
 {
 	int threshold;
 
 	threshold = remote_credits(ssk);
 	if (likely(threshold > SDP_MIN_TX_CREDITS))
 		threshold += threshold / 2;
 
 	if (likely(threshold >= rx_ring_posted(ssk)))
 		return 0;
 	if (unlikely(tx_credits(ssk) <= 0))
 		return 0;
 
 	return (sdp_tx_ring_slots_left(ssk) != 0);
 }
 
 
 #define SDPSTATS_COUNTER_INC(stat)
 #define SDPSTATS_COUNTER_ADD(stat, val)
 #define SDPSTATS_COUNTER_MID_INC(stat, mid)
 #define SDPSTATS_HIST_LINEAR(stat, size)
 #define SDPSTATS_HIST(stat, size)
 
 /*
  * Walk the mbuf chain hanging off 'sbuf' and DMA-unmap each segment,
  * pairing the n-th mbuf with sbuf->mapping[n].
  */
 static inline void
 sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf,
     enum dma_data_direction dir)
 {
 	struct ib_device *ibdev = ssk->ib_device;
 	struct mbuf *seg = sbuf->mb;
 	int idx = 0;
 
 	while (seg != NULL) {
 		ib_dma_unmap_single(ibdev, sbuf->mapping[idx], seg->m_len, dir);
 		seg = seg->m_next;
 		idx++;
 	}
 }
 
 /* sdp_main.c */
 void sdp_set_default_moderation(struct sdp_sock *ssk);
 void sdp_start_keepalive_timer(struct socket *sk);
 void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb);
 void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk);
 void sdp_abort(struct socket *sk);
 struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error);
 
 
 /* sdp_cma.c */
 int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
 
 /* sdp_tx.c */
 int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
 void sdp_tx_ring_destroy(struct sdp_sock *ssk);
 int sdp_xmit_poll(struct sdp_sock *ssk, int force);
 void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb);
 void sdp_post_sends(struct sdp_sock *ssk, int wait);
 void sdp_post_keepalive(struct sdp_sock *ssk);
 
 /* sdp_rx.c */
 void sdp_rx_ring_init(struct sdp_sock *ssk);
 int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
 void sdp_rx_ring_destroy(struct sdp_sock *ssk);
 int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size);
 int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size);
 void sdp_do_posts(struct sdp_sock *ssk);
 void sdp_rx_comp_full(struct sdp_sock *ssk);
 
 /* sdp_zcopy.c */
 struct kiocb;
 int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov);
 int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah);
 void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack);
 void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
 		u32 bytes_completed);
 int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk);
 int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
 		unsigned long *used);
 int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
 		struct rx_srcavail_state *rx_sa);
 int sdp_post_sendsm(struct socket *sk);
 void srcavail_cancel_timeout(struct work_struct *work);
 void sdp_abort_srcavail(struct socket *sk);
 void sdp_abort_rdma_read(struct socket *sk);
 int sdp_process_rx(struct sdp_sock *ssk);
 
 #endif
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c	(revision 303517)
@@ -1,1967 +1,1975 @@
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *      The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
  * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
  */
 
 /*
  *
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+
 #include "sdp.h"
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <sys/sysctl.h>
 
 uma_zone_t	sdp_zone;
 struct rwlock	sdp_lock;
 LIST_HEAD(, sdp_sock) sdp_list;
 
 struct workqueue_struct *rx_comp_wq;
 
 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
 #define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
 #define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
 #define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
 #define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
 /* rw_assert() takes the RA_* assertion flags, matching the SDP_*LOCK_ASSERT macros in sdp.h. */
 #define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RA_WLOCKED)
 #define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RA_RLOCKED)
 #define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RA_LOCKED)
 
-static MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
+MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");
 
 static void sdp_stop_keepalive_timer(struct socket *so);
 
 /*
  * SDP protocol interface to socket abstraction.
  */
 /*
  * sdp_sendspace and sdp_recvspace are the default send and receive window
  * sizes, respectively.
  */
 u_long	sdp_sendspace = 1024*32;
 u_long	sdp_recvspace = 1024*64;
 
 static int sdp_count;
 
 /*
  * Disable async. CMA events for sockets which are being torn down:
  * destroy the socket's rdma_cm_id, if it has one, and clear the pointer.
  */
 static void
 sdp_destroy_cma(struct sdp_sock *ssk)
 {
 
 	if (ssk->id != NULL) {
 		rdma_destroy_id(ssk->id);
 		ssk->id = NULL;
 	}
 }
 
 static int
 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
 {
 	struct sockaddr_in *sin;
 	struct sockaddr_in null;
 	int error;
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
 		return (EINVAL);
 	/* rdma_bind_addr handles bind races.  */
 	SDP_WUNLOCK(ssk);
 	if (ssk->id == NULL)
 		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
 	if (ssk->id == NULL) {
 		SDP_WLOCK(ssk);
 		return (ENOMEM);
 	}
 	if (nam == NULL) {
 		null.sin_family = AF_INET;
 		null.sin_len = sizeof(null);
 		null.sin_addr.s_addr = INADDR_ANY;
 		null.sin_port = 0;
 		bzero(&null.sin_zero, sizeof(null.sin_zero));
 		nam = (struct sockaddr *)&null;
 	}
 	error = -rdma_bind_addr(ssk->id, nam);
 	SDP_WLOCK(ssk);
 	if (error == 0) {
 		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
 		ssk->laddr = sin->sin_addr.s_addr;
 		ssk->lport = sin->sin_port;
 	} else
 		sdp_destroy_cma(ssk);
 	return (error);
 }
 
 static void
 sdp_pcbfree(struct sdp_sock *ssk)
 {
+
 	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
+	KASSERT((ssk->flags & SDP_DESTROY) == 0,
+	    ("ssk %p already destroyed", ssk));
 
 	sdp_dbg(ssk->socket, "Freeing pcb");
 	SDP_WLOCK_ASSERT(ssk);
 	ssk->flags |= SDP_DESTROY;
 	SDP_WUNLOCK(ssk);
 	SDP_LIST_WLOCK();
 	sdp_count--;
 	LIST_REMOVE(ssk, list);
 	SDP_LIST_WUNLOCK();
 	crfree(ssk->cred);
-	sdp_destroy_cma(ssk);
 	ssk->qp_active = 0;
 	if (ssk->qp) {
 		ib_destroy_qp(ssk->qp);
 		ssk->qp = NULL;
 	}
 	sdp_tx_ring_destroy(ssk);
 	sdp_rx_ring_destroy(ssk);
+	sdp_destroy_cma(ssk);
 	rw_destroy(&ssk->rx_ring.destroyed_lock);
-	uma_zfree(sdp_zone, ssk);
 	rw_destroy(&ssk->lock);
+	uma_zfree(sdp_zone, ssk);
 }
 
 /*
  * Common routine to return a socket address: allocate a zeroed
  * sockaddr_in from M_SONAME (M_WAITOK) and fill in family, length,
  * address and port.
  */
 static struct sockaddr *
 sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sa_in;
 
 	sa_in = malloc(sizeof(*sa_in), M_SONAME, M_WAITOK | M_ZERO);
 	sa_in->sin_len = sizeof(*sa_in);
 	sa_in->sin_family = AF_INET;
 	sa_in->sin_port = port;
 	sa_in->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sa_in);
 }
 
 static int
 sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk;
 	struct in_addr addr;
 	in_port_t port;
 
 	ssk = sdp_sk(so);
 	SDP_RLOCK(ssk);
 	port = ssk->lport;
 	addr.s_addr = ssk->laddr;
 	SDP_RUNLOCK(ssk);
 
 	*nam = sdp_sockaddr(port, &addr);
 	return 0;
 }
 
 static int
 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk;
 	struct in_addr addr;
 	in_port_t port;
 
 	ssk = sdp_sk(so);
 	SDP_RLOCK(ssk);
 	port = ssk->fport;
 	addr.s_addr = ssk->faddr;
 	SDP_RUNLOCK(ssk);
 
 	*nam = sdp_sockaddr(port, &addr);
 	return 0;
 }
 
 static void
 sdp_pcbnotifyall(struct in_addr faddr, int errno,
     struct sdp_sock *(*notify)(struct sdp_sock *, int))
 {
 	struct sdp_sock *ssk, *ssk_temp;
 
 	SDP_LIST_WLOCK();
 	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
 		SDP_WLOCK(ssk);
 		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
 			SDP_WUNLOCK(ssk);
 			continue;
 		}
 		if ((ssk->flags & SDP_DESTROY) == 0)
 			if ((*notify)(ssk, errno))
 				SDP_WUNLOCK(ssk);
 	}
 	SDP_LIST_WUNLOCK();
 }
 
 #if 0
 static void
 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
 {
 	struct sdp_sock *ssk;
 
 	SDP_LIST_RLOCK();
 	LIST_FOREACH(ssk, &sdp_list, list) {
 		SDP_WLOCK(ssk);
 		func(ssk, arg);
 		SDP_WUNLOCK(ssk);
 	}
 	SDP_LIST_RUNLOCK();
 }
 #endif
 
 static void
 sdp_output_reset(struct sdp_sock *ssk)
 {
 	struct rdma_cm_id *id;
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (ssk->id) {
 		id = ssk->id;
 		ssk->qp_active = 0;
 		SDP_WUNLOCK(ssk);
 		rdma_disconnect(id);
 		SDP_WLOCK(ssk);
 	}
 	ssk->state = TCPS_CLOSED;
 }
 
 /*
  * Attempt to close a SDP socket, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 static struct sdp_sock *
 sdp_closed(struct sdp_sock *ssk)
 {
 	struct socket *so;
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	ssk->flags |= SDP_DROPPED;
 	so = ssk->socket;
 	soisdisconnected(so);
 	if (ssk->flags & SDP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("sdp_closed: !SS_PROTOREF"));
 		ssk->flags &= ~SDP_SOCKREF;
 		SDP_WUNLOCK(ssk);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 		return (NULL);
 	}
 	return (ssk);
 }
 
 /*
  * Perform timer based shutdowns which can not operate in
  * callout context.
  */
 static void
 sdp_shutdown_task(void *data, int pending)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	SDP_WLOCK(ssk);
 	/*
 	 * I don't think this can race with another call to pcbfree()
 	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
 	 */
 	if (ssk->flags & SDP_DESTROY)
 		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
 		    ssk);
 	if (ssk->flags & SDP_DISCON)
 		sdp_output_reset(ssk);
 	/* We have to clear this so sdp_detach() will call pcbfree(). */
 	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
 	if ((ssk->flags & SDP_DROPPED) == 0 &&
 	    sdp_closed(ssk) == NULL)
 		return;
 	if (ssk->socket == NULL) {
 		sdp_pcbfree(ssk);
 		return;
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * 2msl has expired, schedule the shutdown task.
  * Drops the ssk write lock before returning on every path.
  */
 static void
 sdp_2msl_timeout(void *data)
 {
 	struct sdp_sock *ssk = data;
 
 	/* Nothing to do if the callout was canceled. */
 	if (callout_active(&ssk->keep2msl)) {
 		callout_deactivate(&ssk->keep2msl);
 		/* A clear SDP_TIMEWAIT should be impossible, defensive programming. */
 		if ((ssk->flags & SDP_TIMEWAIT) != 0)
 			taskqueue_enqueue(taskqueue_thread,
 			    &ssk->shutdown_task);
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Schedule the 2msl wait timer.
  */
 static void
 sdp_2msl_wait(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 	ssk->flags |= SDP_TIMEWAIT;
 	ssk->state = TCPS_TIME_WAIT;
 	soisdisconnected(ssk->socket);
 	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
 }
 
 /*
  * Timed out waiting for the final fin/ack from rdma_disconnect().
  * Drops the ssk write lock before returning on every path.
  */
 static void
 sdp_dreq_timeout(void *data)
 {
 	struct sdp_sock *ssk = data;
 	int live, waiting;
 
 	/*
 	 * Act only if the callout was not canceled and was not rescheduled
 	 * (probably as a different timer).
 	 */
 	live = callout_active(&ssk->keep2msl) &&
 	    !callout_pending(&ssk->keep2msl);
 	if (live) {
 		callout_deactivate(&ssk->keep2msl);
 		waiting = (ssk->state == TCPS_FIN_WAIT_1 ||
 		    ssk->state == TCPS_LAST_ACK) &&
 		    (ssk->flags & SDP_DREQWAIT) != 0;
 		if (waiting) {
 			ssk->flags &= ~SDP_DREQWAIT;
 			ssk->flags |= SDP_DISCON;
 			sdp_2msl_wait(ssk);
 			ssk->qp_active = 0;
 		}
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Received the final fin/ack.  Cancel the 2msl.
  */
 void
 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
 {
 	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
 	ssk->flags &= ~SDP_DREQWAIT;
 	sdp_2msl_wait(ssk);
 }
 
 /*
  * Initialize the protocol-private state of the sdp_sock attached to sk.
  *
  * Sets up the shared keep2msl callout, the shutdown task, the mseq_ack
  * counter and the receive ring, and clears the transmit ring buffer
  * pointer.  Always returns 0.
  */
 static int
 sdp_init_sock(struct socket *sk)
 {
 	struct sdp_sock *ssk = sdp_sk(sk);
 
 	sdp_dbg(sk, "%s\n", __func__);
 
 	/*
 	 * The callout is tied to the socket lock; with CALLOUT_RETURNUNLOCKED
 	 * the timer handlers are responsible for releasing that lock.
 	 */
 	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
 	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
 #ifdef SDP_ZCOPY
 	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
 	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
 	ssk->tx_ring.rdma_inflight = NULL;
 #endif
 	atomic_set(&ssk->mseq_ack, 0);
 	sdp_rx_ring_init(ssk);
 	ssk->tx_ring.buffer = NULL;
 
 	return 0;
 }
 
 /*
  * Allocate an sdp_sock for the socket and reserve socket buffer space.
  *
  * Returns 0 on success, the soreserve() error if buffer space cannot be
  * reserved, or ENOBUFS if the sdp_sock cannot be allocated.
  */
 static int
 sdp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	int error;
 
 	ssk = sdp_sk(so);
 	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
 	/* Reserve buffer space only if either high-water mark is unset. */
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, sdp_sendspace, sdp_recvspace);
 		if (error)
 			return (error);
 	}
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	/* Zeroed, non-sleeping allocation; failure is reported as ENOBUFS. */
 	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
 	if (ssk == NULL)
 		return (ENOBUFS);
 	rw_init(&ssk->lock, "sdpsock");
 	ssk->socket = so;
 	ssk->cred = crhold(so->so_cred);
 	so->so_pcb = (caddr_t)ssk;
 	/* sdp_init_sock() always returns 0, so its result is not checked. */
 	sdp_init_sock(so);
 	ssk->flags = 0;
 	ssk->qp_active = 0;
 	ssk->state = TCPS_CLOSED;
+	mbufq_init(&ssk->rxctlq, INT_MAX);
 	/* Publish the new pcb on the global list and account for it. */
 	SDP_LIST_WLOCK();
 	LIST_INSERT_HEAD(&sdp_list, ssk, list);
 	sdp_count++;
 	SDP_LIST_WUNLOCK();
 	/* SO_LINGER with a zero linger value gets TCP_LINGERTIME. */
 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
 		so->so_linger = TCP_LINGERTIME;
 
 	return (0);
 }
 
 /*
  * Detach SDP from the socket, potentially leaving it around for the
  * timewait to expire.
  *
  * The link between socket and pcb is severed in both directions.  The pcb
  * is kept (and only unlocked) while a 2msl or DREQ wait is pending; it is
  * handed to sdp_pcbfree() if it was dropped or never got as far as
  * TCPS_SYN_SENT.  Any other state is treated as a bug and panics.
  */
 static void
 sdp_detach(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
 	ssk->socket->so_pcb = NULL;
 	ssk->socket = NULL;
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
 		SDP_WUNLOCK(ssk);
 	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
 		/* NOTE(review): presumably sdp_pcbfree() consumes the lock. */
 		sdp_pcbfree(ssk);
 	else
 		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
 }
 
 /*
  * Allocate a local address for the socket.
  *
  * Only well-formed, non-multicast AF_INET addresses are accepted.  Returns
  * EINVAL for a malformed address or a socket that is in time-wait or has
  * been dropped, EAFNOSUPPORT for a multicast address, and otherwise the
  * result of sdp_pcbbind().
  */
 static int
 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct sdp_sock *ssk;
 	int error;
 
 	if (nam->sa_len != sizeof(*sin) || sin->sin_family != AF_INET)
 		return (EINVAL);
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0)
 		error = EINVAL;
 	else
 		error = sdp_pcbbind(ssk, nam, td->td_ucred);
 	SDP_WUNLOCK(ssk);
 
 	return (error);
 }
 
 /*
  * Prepare to accept connections.
  *
  * Binds an ephemeral local port when none is bound yet, moves the socket
  * and the pcb into the listening state and finally, with the socket lock
  * released, asks the RDMA CM to listen.  Returns EINVAL for a socket in
  * time-wait or already dropped, otherwise the first error encountered.
  */
 static int
 sdp_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	int error;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0) {
 		SDP_WUNLOCK(ssk);
 		return (EINVAL);
 	}
 
 	error = 0;
 	if (ssk->lport == 0)
 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
 	/* The socket lock is taken even when the bind failed. */
 	SOCK_LOCK(so);
 	if (error == 0)
 		error = solisten_proto_check(so);
 	if (error == 0) {
 		solisten_proto(so, backlog);
 		ssk->state = TCPS_LISTEN;
 	}
 	SOCK_UNLOCK(so);
 	SDP_WUNLOCK(ssk);
 
 	if (error != 0)
 		return (error);
 	/* rdma_listen() result is negated to obtain the errno returned. */
 	return (-rdma_listen(ssk->id, backlog));
 }
 
 /*
  * Initiate a SDP connection to nam.
  *
  * Binds an ephemeral local port if none is bound yet, marks the socket as
  * connecting and asks the RDMA CM to resolve the destination, using the
  * bound local address and port as the source.  The socket write lock must
  * be held on entry; it is dropped around rdma_resolve_addr() and
  * re-acquired before returning, so pcb state may have changed across the
  * call.
  *
  * Returns 0 on success, or the error from sdp_pcbbind() or from
  * rdma_resolve_addr() (negated to a positive errno value).  A resolution
  * failure used to be discarded here, leaving the caller to believe that
  * the connect had been started.
  */
 static int
 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in src;
 	struct socket *so;
 	int error;
 
 	so = ssk->socket;
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (ssk->lport == 0) {
 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			return (error);
 	}
 	src.sin_family = AF_INET;
 	src.sin_len = sizeof(src);
 	bzero(&src.sin_zero, sizeof(src.sin_zero));
 	src.sin_port = ssk->lport;
 	src.sin_addr.s_addr = ssk->laddr;
 	soisconnecting(so);
 	/* The socket lock is not held across the resolve call. */
 	SDP_WUNLOCK(ssk);
 	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
 	    SDP_RESOLVE_TIMEOUT);
 	SDP_WLOCK(ssk);
 	if (error == 0)
 		ssk->state = TCPS_SYN_SENT;
 
 	/* Propagate a resolution failure instead of reporting success. */
 	return (error);
 }
 
 /*
  * Initiate SDP connection.
  */
 static int
 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 	struct sockaddr_in *sin;
 
 	sin = (struct sockaddr_in *)nam;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EINVAL);
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
 		return (error);
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
 		error = EINVAL;
 	else
 		error = sdp_start_connect(ssk, nam, td);
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /*
  * Drop a SDP socket, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  *
  * The caller must hold the socket write lock.  The error is stored in
  * so_error; an ETIMEDOUT is replaced by a previously recorded soft error,
  * if any.  Returns whatever sdp_closed() returns.
  *
  * NOTE(review): the parameter is named `errno', which would clash with the
  * userland errno macro if this code were ever built outside the kernel.
  */
 static struct sdp_sock *
 sdp_drop(struct sdp_sock *ssk, int errno)
 {
 	struct socket *so;
 
 	SDP_WLOCK_ASSERT(ssk);
 	so = ssk->socket;
 	if (TCPS_HAVERCVDSYN(ssk->state))
 		sdp_output_reset(ssk);
 	if (errno == ETIMEDOUT && ssk->softerror)
 		errno = ssk->softerror;
 	so->so_error = errno;
 	return (sdp_closed(ssk));
 }
 
 /*
  * User issued close, and wish to trail through shutdown states:
  * if never received SYN, just forget it.  If got a SYN from peer,
  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
  * If already got a FIN from peer, then almost done; go to LAST_ACK
  * state.  In all other cases, have already sent FIN to peer (e.g.
  * after PRU_SHUTDOWN), and just have to play tedious game waiting
  * for peer to send FIN or not respond to keep-alives, etc.
  * We can let the user exit from the close as soon as the FIN is acked.
  *
  * The caller must hold the socket write lock.  Note that the lock is
  * dropped and re-acquired while a listening socket's CMA is destroyed.
  */
 static void
 sdp_usrclosed(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	/* States without a case below are left unchanged by the switch. */
 	switch (ssk->state) {
 	case TCPS_LISTEN:
 		ssk->state = TCPS_CLOSED;
 		/* sdp_destroy_cma() is called without the socket lock. */
 		SDP_WUNLOCK(ssk);
 		sdp_destroy_cma(ssk);
 		SDP_WLOCK(ssk);
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		ssk = sdp_closed(ssk);
 		/*
 		 * sdp_closed() should never return NULL here as the socket is
 		 * still open.
 		 */
 		KASSERT(ssk != NULL,
 		    ("sdp_usrclosed: sdp_closed() returned NULL"));
 		break;
 
 	case TCPS_SYN_SENT:
 		/* FALLTHROUGH */
 	case TCPS_SYN_RECEIVED:
 		/* Not established yet: only record that a FIN is needed. */
 		ssk->flags |= SDP_NEEDFIN;
 		break;
 
 	case TCPS_ESTABLISHED:
 		ssk->flags |= SDP_NEEDFIN;
 		ssk->state = TCPS_FIN_WAIT_1;
 		break;
 
 	case TCPS_CLOSE_WAIT:
 		ssk->state = TCPS_LAST_ACK;
 		break;
 	}
 	if (ssk->state >= TCPS_FIN_WAIT_2) {
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
 		if (ssk->state == TCPS_FIN_WAIT_2)
 			sdp_2msl_wait(ssk);
 		else
 			soisdisconnected(ssk->socket);
 	}
 }
 
 /*
  * Start the disconnect handshake towards the peer.
  *
  * Arms the shared keep2msl callout as the DREQ timer (sdp_dreq_timeout()
  * after SDP_FIN_WAIT_TIMEOUT), records that a FIN is needed and that a
  * disconnect reply is awaited, and kicks the send path.  The caller must
  * hold the socket write lock.
  */
 static void
 sdp_output_disconnect(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
 	    sdp_dreq_timeout, ssk);
 	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
 	sdp_post_sends(ssk, M_NOWAIT);
 }
 
 /*
  * Initiate or continue a disconnect.
  * If embryonic state, just send reset (once).
  * If in ``let data drain'' option and linger null, just drop.
  * Otherwise (hard), mark socket disconnecting and drop
  * current input data; switch states based on user close, and
  * send segment to peer (with FIN).
  *
  * The caller must hold the socket write lock.  The keepalive timer is
  * stopped first, since it uses the same callout as the disconnect timers.
  */
 static void
 sdp_start_disconnect(struct sdp_sock *ssk)
 {
 	struct socket *so;
 	int unread;
 
 	so = ssk->socket;
 	SDP_WLOCK_ASSERT(ssk);
 	sdp_stop_keepalive_timer(so);
 	/*
 	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
 	 * socket is still open.
 	 */
 	if (ssk->state < TCPS_ESTABLISHED) {
 		ssk = sdp_closed(ssk);
 		KASSERT(ssk != NULL,
 		    ("sdp_start_disconnect: sdp_close() returned NULL"));
 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
 		ssk = sdp_drop(ssk, 0);
 		KASSERT(ssk != NULL,
 		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
 	} else {
 		soisdisconnecting(so);
 		/* Remember whether unread data is being thrown away. */
 		unread = sbused(&so->so_rcv);
 		sbflush(&so->so_rcv);
 		sdp_usrclosed(ssk);
 		if (!(ssk->flags & SDP_DROPPED)) {
 			/* Discarded input means a reset, not a clean close. */
 			if (unread)
 				sdp_output_reset(ssk);
 			else
 				sdp_output_disconnect(ssk);
 		}
 	}
 }
 
 /*
  * User initiated disconnect.
  *
  * Returns ECONNRESET if the connection is already in time-wait or has been
  * dropped; otherwise starts the disconnect and returns 0.
  */
 static int
 sdp_disconnect(struct socket *so)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 	int error;
 
 	SDP_WLOCK(ssk);
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0) {
 		error = ECONNRESET;
 	} else {
 		sdp_start_disconnect(ssk);
 		error = 0;
 	}
 	SDP_WUNLOCK(ssk);
 
 	return (error);
 }
 
 /*
  * Accept a connection.  Nearly all of the work happens at higher levels;
  * this routine only hands back the peer's address through nam.
  *
  * XXX This is broken XXX
  *
  * Why the sdp lock is taken here is somewhat involved and is explained in
  * the commit log entry for r175612.  Taking it delays an accept(2) that
  * races with sonewconn(), which puts the socket on the listen queue before
  * the address/port fields have been initialized.  A better fix would keep
  * the socket off the listen queue until every field is fully initialized.
  *
  * Returns ECONNABORTED if the socket is already disconnected, in time-wait
  * or dropped; nam is only written on success.
  */
 static int
 sdp_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk;
 	struct in_addr addr;
 	in_port_t port;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNABORTED);
 	}
 	/* Snapshot the peer address under the lock. */
 	port = ssk->fport;
 	addr.s_addr = ssk->faddr;
 	SDP_WUNLOCK(ssk);
 
 	*nam = sdp_sockaddr(port, &addr);
 	return (0);
 }
 
 /*
  * Mark the connection as being incapable of further output.
  *
  * Returns ECONNRESET if the connection is already in time-wait or has been
  * dropped.  Otherwise the send side is closed, the state machine is
  * advanced and, unless that dropped the connection, the disconnect is sent.
  */
 static int
 sdp_shutdown(struct socket *so)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 
 	SDP_WLOCK(ssk);
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNRESET);
 	}
 	socantsendmore(so);
 	sdp_usrclosed(ssk);
 	if ((ssk->flags & SDP_DROPPED) == 0)
 		sdp_output_disconnect(ssk);
 	SDP_WUNLOCK(ssk);
 
 	return (0);
 }
 
 /*
  * Append the packet mb to the socket buffer sb.
  *
  * mb must carry a packet header and the sockbuf lock must be held.  cnt is
  * the chain length of mb as counted by the caller (see sdp_send()); it is
  * only used for the SDP_MAX_SEND_SGES check.  If the buffer is empty, mb
  * becomes its only record.  Otherwise mb is either coalesced into the last
  * record or linked in as a new record behind it.
  */
 static void
 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
 {
 	struct mbuf *n;
 	int ncnt;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	SBLASTRECORDCHK(sb);
 	KASSERT(mb->m_flags & M_PKTHDR,
 		("sdp_append: %p Missing packet header.\n", mb));
 	n = sb->sb_lastrecord;
 	/*
 	 * If the queue is empty just set all pointers and proceed.
 	 */
 	if (n == NULL) {
 		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
 		for (; mb; mb = mb->m_next) {
 	                sb->sb_mbtail = mb;
 			sballoc(sb, mb);
 		}
 		return;
 	}
 	/*
 	 * Count the number of mbufs in the current tail.
 	 */
 	for (ncnt = 0; n->m_next; n = n->m_next)
 		ncnt++;
 	/* The walk above moved n; point it back at the last record. */
 	n = sb->sb_lastrecord;
 	/*
 	 * If the two chains can fit in a single sdp packet and
 	 * the last record has not been sent yet (WRITABLE) coalesce
 	 * them.  The lastrecord remains the same but we must strip the
 	 * packet header and then let sbcompress do the hard part.
 	 */
 	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
 	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
 	    ssk->xmit_size_goal) {
 		/* Drop mb's SDP header; the merged packet keeps n's. */
 		m_adj(mb, SDP_HEAD_SIZE);
 		n->m_pkthdr.len += mb->m_pkthdr.len;
 		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
 		m_demote(mb, 1, 0);
 		sbcompress(sb, mb, sb->sb_mbtail);
 		return;
 	}
 	/*
 	 * Not compressible, just append to the end and adjust counters.
 	 */
 	sb->sb_lastrecord->m_flags |= M_PUSH;
 	sb->sb_lastrecord->m_nextpkt = mb;
 	sb->sb_lastrecord = mb;
 	if (sb->sb_sndptr == NULL)
 		sb->sb_sndptr = mb;
 	for (; mb; mb = mb->m_next) {
 		sb->sb_mbtail = mb;
 		sballoc(sb, mb);
 	}
 }
 
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.  Unlike the other
  * pru_*() routines, the mbuf chains are our responsibility.  We
  * must either enqueue them or free them.  The other pru_* routines
  * generally are caller-frees.
  *
  * This comes from sendfile, normal sends will come from sdp_sosend().
  *
  * m must carry a packet header; an SDP header is prepended to it here and
  * chains longer than SDP_MAX_SEND_SGES are collapsed.  control, if given,
  * must be empty since SDP has no control messages.  If nam is given and
  * the socket is not yet connecting, an implied connect is started after
  * the data has been queued.
  *
  * Returns 0 on success, EMSGSIZE if the chain cannot be collapsed,
  * ECONNRESET if the connection is in time-wait or dropped, EINVAL for a
  * non-empty control message, ENOBUFS if an out-of-band send finds the send
  * buffer over-committed, or the error from the implied connect.
  */
 static int
 sdp_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	struct mbuf *n;
 	int error;
 	int cnt;
 
 	error = 0;
 	ssk = sdp_sk(so);
 	KASSERT(m->m_flags & M_PKTHDR,
 	    ("sdp_send: %p no packet header", m));
 	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
 	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
 	for (n = m, cnt = 0; n->m_next; n = n->m_next)
 		cnt++;
 	if (cnt > SDP_MAX_SEND_SGES) {
 		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
 		if (n == NULL) {
 			/*
 			 * Both chains are ours to dispose of; do not leak
 			 * the control chain on this early exit.
 			 */
 			if (control)
 				m_freem(control);
 			m_freem(m);
 			return (EMSGSIZE);
 		}
 		m = n;
 		for (cnt = 0; n->m_next; n = n->m_next)
 			cnt++;
 	}
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		if (control)
 			m_freem(control);
 		if (m)
 			m_freem(m);
 		error = ECONNRESET;
 		goto out;
 	}
 	if (control) {
 		/* SDP doesn't support control messages. */
 		if (control->m_len) {
 			m_freem(control);
 			if (m)
 				m_freem(m);
 			error = EINVAL;
 			goto out;
 		}
 		m_freem(control);	/* empty control, just free it */
 	}
 	if (!(flags & PRUS_OOB)) {
 		SOCKBUF_LOCK(&so->so_snd);
 		sdp_append(ssk, &so->so_snd, m, cnt);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && ssk->state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected.
 			 */
 			error = sdp_start_connect(ssk, nam, td);
 			if (error)
 				goto out;
 		}
 		if (flags & PRUS_EOF) {
 			/*
 			 * Close the send side of the connection after
 			 * the data is sent.
 			 */
 			socantsendmore(so);
 			sdp_usrclosed(ssk);
 			if (!(ssk->flags & SDP_DROPPED))
 				sdp_output_disconnect(ssk);
 		} else if (!(ssk->flags & SDP_DROPPED) &&
 		    !(flags & PRUS_MORETOCOME))
 			sdp_post_sends(ssk, M_NOWAIT);
 		SDP_WUNLOCK(ssk);
 		return (0);
 	} else {
 		SOCKBUF_LOCK(&so->so_snd);
 		/* Out-of-band data may overshoot the buffer by 512 bytes. */
 		if (sbspace(&so->so_snd) < -512) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			m_freem(m);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
 		m->m_flags |= M_URG | M_PUSH;
 		sdp_append(ssk, &so->so_snd, m, cnt);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && ssk->state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected.
 			 */
 			error = sdp_start_connect(ssk, nam, td);
 			if (error)
 				goto out;
 		}
 		sdp_post_sends(ssk, M_NOWAIT);
 		SDP_WUNLOCK(ssk);
 		return (0);
 	}
 out:
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /* Map MSG_DONTWAIT in the send/receive flags onto the sblock() wait flag. */
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 
 /*
  * Send on a socket.  If send must go all at once and message is larger than
  * send buffering, then hard error.  Lock against other senders.  If must go
  * all at once and not enough room now, then inform user that this would
  * block and do nothing.  Otherwise, if nonblocking, send as much as
  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
  * in mbuf chain must be small enough to send all at once.
  *
  * Returns nonzero on error, timeout or signal; callers must check for short
  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
  * on return.
  *
  * A non-empty control chain is rejected with EINVAL, since SDP does not
  * support control messages.
  */
 static int
 sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	long space, resid;
 	int atomic;
 	int error;
 	int copy;
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
 		resid = top->m_pkthdr.len;
 	/* A caller-supplied mbuf chain must be sent in one piece. */
 	atomic = top != NULL;
 	if (control != NULL) {
 		if (control->m_len) {
 			m_freem(control);
 			if (top)
 				m_freem(top);
 			return (EINVAL);
 		}
 		m_freem(control);
 		control = NULL;
 	}
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 *
 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 	 * type sockets since that's an error.
 	 */
 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
 		error = EINVAL;
 		goto out;
 	}
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 
 	ssk = sdp_sk(so);
 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 	if (error)
 		goto out;
 
 restart:
 	do {
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EPIPE;
 			goto release;
 		}
 		if (so->so_error) {
 			/* Report a pending socket error once and clear it. */
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto release;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto release;
 		}
 		space = sbspace(&so->so_snd);
 		if (flags & MSG_OOB)
 			space += 1024;
 		/* An atomic send has to fit into a single SDP packet. */
 		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EMSGSIZE;
 			goto release;
 		}
 		if (space < resid &&
 		    (atomic || space < so->so_snd.sb_lowat)) {
 			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EWOULDBLOCK;
 				goto release;
 			}
 			/* Wait for buffer space, then re-run all the checks. */
 			error = sbwait(&so->so_snd);
 			SOCKBUF_UNLOCK(&so->so_snd);
 			if (error)
 				goto release;
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 		do {
 			if (uio == NULL) {
 				resid = 0;
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			} else {
 				/*
 				 * Copy the data from userland into a mbuf
 				 * chain.  If no data is to be copied in,
 				 * a single empty mbuf is returned.
 				 */
 				copy = min(space,
 				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
 				top = m_uiotombuf(uio, M_WAITOK, copy,
 				    0, M_PKTHDR |
 				    ((flags & MSG_EOR) ? M_EOR : 0));
 				if (top == NULL) {
 					/* only possible error */
 					error = EFAULT;
 					goto release;
 				}
 				/* Charge the bytes just copied in against space. */
 				space -= resid - uio->uio_resid;
 				resid = uio->uio_resid;
 			}
 			/*
 			 * XXX all the SBS_CANTSENDMORE checks previously
 			 * done could be out of date after dropping the
 			 * socket lock.
 			 */
 			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * Set EOF on the last send if the user specified
 			 * MSG_EOF.
 			 */
 			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME. */
 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
 			    top, addr, NULL, td);
 			/* sdp_send() owns the chain now, whatever it returned. */
 			top = NULL;
 			if (error)
 				goto release;
 		} while (resid && space > 0);
 	} while (resid);
 
 release:
 	sbunlock(&so->so_snd);
 out:
 	if (top != NULL)
 		m_freem(top);
 	return (error);
 }
 
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
  * data from a socket.  For more complete comments, see soreceive(), from
  * which this code originated.
  *
  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  * unable to return an mbuf chain to the caller.
  */
 static int
 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 	struct protosw *pr = so->so_proto;
 	struct mbuf *m;
 	int error;
 
 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 
 	m = m_get(M_WAITOK, MT_DATA);
 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
 	if (error)
 		goto bad;
 	do {
 		error = uiomove(mtod(m, void *),
 		    (int) min(uio->uio_resid, m->m_len), uio);
 		m = m_free(m);
 	} while (uio->uio_resid && error == 0 && m);
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for stream (TCP) sockets.
  *
  * Data is delivered either by copying into uio or, when mp0 is given, by
  * handing mbufs back to the caller.  Control messages are not supported
  * (a non-NULL controlp yields EINVAL) and out-of-band reads are passed on
  * to soreceive_rcvoob().  Unless MSG_PEEK is set the delivered data is
  * removed from the receive buffer and the protocol is told that space was
  * freed.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int len = 0, error = 0, flags, oresid;
 	struct sockbuf *sb;
 	struct mbuf *m, *n = NULL;
 	struct sdp_sock *ssk;
 
 	/* We only do stream sockets. */
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		return (EINVAL);
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp0 != NULL)
 		*mp0 = NULL;
 
 	sb = &so->so_rcv;
 	ssk = sdp_sk(so);
 
 	/*
 	 * Prevent other readers from entering the socket.  On failure
 	 * neither the I/O lock nor the sockbuf mutex is held, so return
 	 * directly rather than through the unlocking exit path.
 	 */
 	error = sblock(sb, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 	SOCKBUF_LOCK(sb);
 
 	/* Easy one, no space to copyout anything. */
 	if (uio->uio_resid == 0) {
 		error = EINVAL;
 		goto out;
 	}
 	oresid = uio->uio_resid;
 
 	/* We will never ever get anything unless we are connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		/* When disconnecting there may be still some data left. */
 		if (sbavail(sb))
 			goto deliver;
 		if (!(so->so_state & SS_ISDISCONNECTED))
 			error = ENOTCONN;
 		goto out;
 	}
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
 
 restart:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
 		if (sbavail(sb))
 			goto deliver;
 		/* Something was already delivered; report that first. */
 		if (oresid > uio->uio_resid)
 			goto out;
 		error = so->so_error;
 		if (!(flags & MSG_PEEK))
 			so->so_error = 0;
 		goto out;
 	}
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
 		if (sbavail(sb))
 			goto deliver;
 		else
 			goto out;
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
 	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sbavail(sb) >= sb->sb_lowat ||
 	     sbavail(sb) >= uio->uio_resid ||
 	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
 		goto deliver;
 
 	/*
 	 * Wait and block until (more) data comes in.
 	 * NB: Drops the sockbuf lock during wait.
 	 */
 	error = sbwait(sb);
 	if (error)
 		goto out;
 	goto restart;
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
 	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
 			for (*mp0 = m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
 				n = m;
 			}
 			sb->sb_mb = m;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
 			/* Cut the dequeued chain off from the buffer. */
 			n->m_next = NULL;
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
 				/*
 				 * The copy may span several mbufs, so
 				 * account for all len bytes rather than
 				 * just the first mbuf's length.
 				 */
 				uio->uio_resid -= len;
 			if (*mp0 != NULL)
 				n->m_next = m;
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 	} else {
 		/* NB: Must unlock socket buffer as uiomove may sleep. */
 		SOCKBUF_UNLOCK(sb);
 		error = m_mbuftouio(uio, sb->sb_mb, len);
 		SOCKBUF_LOCK(sb);
 		if (error)
 			goto out;
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 
 	/*
 	 * Remove the delivered data from the socket buffer unless we
 	 * were only peeking.
 	 */
 	if (!(flags & MSG_PEEK)) {
 		if (len > 0)
 			sbdrop_locked(sb, len);
 
 		/* Notify protocol that we drained some data. */
 		SOCKBUF_UNLOCK(sb);
 		SDP_WLOCK(ssk);
 		sdp_do_posts(ssk);
 		SDP_WUNLOCK(ssk);
 		SOCKBUF_LOCK(sb);
 	}
 
 	/*
 	 * For MSG_WAITALL we may have to loop again and wait for
 	 * more data to come in.
 	 */
 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
 		goto restart;
 out:
 	SOCKBUF_LOCK_ASSERT(sb);
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 	SOCKBUF_UNLOCK(sb);
 	sbunlock(sb);
 	return (error);
 }
 
 /*
  * Abort tears a connection down, typically while it is still sitting in
  * the accept queue.
  */
 void
 sdp_abort(struct socket *so)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 
 	SDP_WLOCK(ssk);
 	/* Drop the connection now unless that has already happened. */
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) == 0)
 		sdp_drop(ssk, ECONNABORTED);
 	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
 	    ssk, ssk->flags));
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Close a SDP socket and initiate a friendly disconnect.
  */
 static void
 sdp_close(struct socket *so)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 
 	SDP_WLOCK(ssk);
 	/* Start the disconnect unless the connection is already going away. */
 	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) == 0)
 		sdp_start_disconnect(ssk);
 
 	/*
 	 * If the pcb still has not been dropped, tell the socket layer that
 	 * the protocol keeps a reference on the socket and pcb for a while.
 	 */
 	if ((ssk->flags & SDP_DROPPED) == 0) {
 		SOCK_LOCK(so);
 		so->so_state |= SS_PROTOREF;
 		SOCK_UNLOCK(so);
 		ssk->flags |= SDP_SOCKREF;
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * User requests out-of-band data.
  *
  * On success the single saved out-of-band byte is stored in m.  Returns
  * ECONNRESET if the receive ring cannot be locked or the connection is in
  * time-wait or dropped, EINVAL if there is no urgent mark, the data is
  * delivered inline or it has already been read, and EWOULDBLOCK if the
  * byte has not arrived yet.
  */
 static int
 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 	int error;
 
 	SDP_WLOCK(ssk);
 	if (!rx_ring_trylock(&ssk->rx_ring)) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNRESET);
 	}
 
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = ECONNRESET;
 	} else if ((so->so_oobmark == 0 &&
 	    (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
 	    so->so_options & SO_OOBINLINE ||
 	    ssk->oobflags & SDP_HADOOB) {
 		error = EINVAL;
 	} else if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
 		error = EWOULDBLOCK;
 	} else {
 		error = 0;
 		m->m_len = 1;
 		*mtod(m, caddr_t) = ssk->iobc;
 		/* A real read flips HAVEOOB to HADOOB; a peek leaves it. */
 		if ((flags & MSG_PEEK) == 0)
 			ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
 	}
 
 	rx_ring_unlock(&ssk->rx_ring);
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /*
  * Process an incoming packet mb that carries urgent data.
  *
  * Sets the out-of-band mark to the last byte of mb, signals the socket and
  * resets the OOB flags.  Unless SO_OOBINLINE is set, the last byte of the
  * chain is pulled out of the data and saved in ssk->iobc.  Nothing is done
  * when the pcb has no socket attached.
  */
 void
 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
 {
 	struct socket *so = ssk->socket;
 	struct mbuf *last;
 
 	if (so == NULL)
 		return;
 
 	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
 	sohasoutofband(so);
 	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
 	if (so->so_options & SO_OOBINLINE)
 		return;
 
 	/* Find the final mbuf of the chain; it holds the urgent byte. */
 	last = mb;
 	while (last->m_next != NULL)
 		last = last->m_next;
 	ssk->iobc = *(mtod(last, char *) + last->m_len - 1);
 	ssk->oobflags |= SDP_HAVEOOB;
 	/* Trim the saved byte off the in-band data. */
 	last->m_len--;
 	mb->m_pkthdr.len--;
 }
 
 /*
  * Notify a sdp socket of an asynchronous error.
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 struct sdp_sock *
 sdp_notify(struct sdp_sock *ssk, int error)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	if ((ssk->flags & SDP_TIMEWAIT) ||
 	    (ssk->flags & SDP_DROPPED))
 		return (ssk);
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 */
 	if (ssk->state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN))
 		return (ssk);
 	ssk->softerror = error;
 	return sdp_drop(ssk, error);
 }
 
 /*
  * Control input: pass the error mapped from cmd to every SDP pcb talking
  * to the foreign address in sa.  Non-AF_INET and wildcard addresses are
  * ignored; vip is unused.
  *
  * NOTE(review): cmd is used to index inetctlerrmap[] without a range
  * check; confirm that callers only pass valid commands.
  */
 static void
 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)sa;
 
 	if (sa->sa_family != AF_INET)
 		return;
 	if (sin->sin_addr.s_addr == INADDR_ANY)
 		return;
 
 	sdp_pcbnotifyall(sin->sin_addr, inetctlerrmap[cmd], sdp_notify);
 }
 
 /*
  * Socket control (ioctl) requests are not supported by SDP; every request
  * is rejected with EOPNOTSUPP regardless of its arguments.
  */
 static int
 sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
     struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * Keepalive timer handler for the shared keep2msl callout.
  *
  * Posts a keepalive and re-arms itself for as long as the connection has
  * not been dropped and SO_KEEPALIVE remains set on the socket.
  *
  * Like sdp_2msl_timeout() and sdp_dreq_timeout(), this handler runs on a
  * callout set up with CALLOUT_RETURNUNLOCKED, so it must release the
  * socket lock on every path, including the "callout canceled" one.
  */
 static void
 sdp_keepalive_timeout(void *data)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	/* Callout canceled. */
 	if (!callout_active(&ssk->keep2msl))
 		goto out;
 	/* Callout rescheduled as a different kind of timer. */
 	if (callout_pending(&ssk->keep2msl))
 		goto out;
 	callout_deactivate(&ssk->keep2msl);
 	if (ssk->flags & SDP_DROPPED ||
 	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
 		goto out;
 	sdp_post_keepalive(ssk);
 	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
 	    sdp_keepalive_timeout, ssk);
 out:
 	SDP_WUNLOCK(ssk);
 }
 
 
 /*
  * Arm the keepalive timer on the shared keep2msl callout, unless the
  * callout is already pending.
  */
 void
 sdp_start_keepalive_timer(struct socket *so)
 {
 	struct sdp_sock *ssk = sdp_sk(so);
 
 	if (callout_pending(&ssk->keep2msl))
 		return;
 	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
 	    sdp_keepalive_timeout, ssk);
 }
 
 static void
 sdp_stop_keepalive_timer(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	callout_stop(&ssk->keep2msl);
 }
 
 /*
  * sdp_ctloutput() must drop the socket lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  *
  * The macro operates on its argument (it used to ignore the parameter and
  * silently pick up a variable named ssk from the caller).  It returns
  * ECONNRESET from the enclosing function, with the lock released, if the
  * connection went into time-wait or was dropped in the meantime.
  */
 #define SDP_WLOCK_RECHECK(ssk) do {					\
 	SDP_WLOCK(ssk);							\
 	if ((ssk)->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
 		SDP_WUNLOCK(ssk);					\
 		return (ECONNRESET);					\
 	}								\
 } while(0)
 
 /*
  * Get or set socket options.
  *
  * A SOL_SOCKET/SO_KEEPALIVE request starts or stops the keepalive timer to
  * match the socket's current SO_KEEPALIVE setting.  Apart from that only
  * IPPROTO_TCP level options are handled, and of those only TCP_NODELAY;
  * other option names yield ENOPROTOOPT and other levels return 0.
  * Returns ECONNRESET if the connection is in time-wait or dropped.
  */
 static int
 sdp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int	error, opt, optval;
 	struct sdp_sock *ssk;
 
 	error = 0;
 	ssk = sdp_sk(so);
 	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
 		SDP_WLOCK(ssk);
 		if (so->so_options & SO_KEEPALIVE)
 			sdp_start_keepalive_timer(so);
 		else
 			sdp_stop_keepalive_timer(so);
 		SDP_WUNLOCK(ssk);
 	}
 	if (sopt->sopt_level != IPPROTO_TCP)
 		return (error);
 
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNRESET);
 	}
 
 	/* Every branch below is responsible for releasing the lock. */
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case TCP_NODELAY:
 			/* The lock is not held across the copyin. */
 			SDP_WUNLOCK(ssk);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			SDP_WLOCK_RECHECK(ssk);
 			opt = SDP_NODELAY;
 			if (optval)
 				ssk->flags |= opt;
 			else
 				ssk->flags &= ~opt;
 			sdp_do_posts(ssk);
 			SDP_WUNLOCK(ssk);
 			break;
 
 		default:
 			SDP_WUNLOCK(ssk);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case TCP_NODELAY:
 			/* Reports the raw flag bit, not a normalized 0/1. */
 			optval = ssk->flags & SDP_NODELAY;
 			SDP_WUNLOCK(ssk);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		default:
 			SDP_WUNLOCK(ssk);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 #undef SDP_WLOCK_RECHECK
 
 /*
  * Default completion queue moderation settings.  Moderation is only
  * applied when both values are positive.
  */
 int sdp_mod_count = 0;
 int sdp_mod_usec = 0;
 
 /*
  * Apply the default moderation settings (sdp_mod_count and sdp_mod_usec)
  * to the receive ring's completion queue.  Does nothing unless both
  * settings are positive.  The result of ib_modify_cq() is not checked.
  */
 void
 sdp_set_default_moderation(struct sdp_sock *ssk)
 {
 	struct ib_cq_attr attr;
 	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
 		return;
 	memset(&attr, 0, sizeof(attr));
 	attr.moderation.cq_count = sdp_mod_count;
 	attr.moderation.cq_period = sdp_mod_usec;
 
 	ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION);
 }
 
 /*
  * IB client "add" callback: set up the per-device SDP state.
  *
  * Allocates a protection domain, a DMA memory region and an FMR pool for
  * the device and registers the result as this client's data.  If any step
  * fails, whatever was set up so far is released again and no client data
  * is registered for the device.
  */
 static void
 sdp_dev_add(struct ib_device *device)
 {
 	struct ib_fmr_pool_param param;
 	struct sdp_device *sdp_dev;
 
 	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
 	sdp_dev->pd = ib_alloc_pd(device);
 	if (IS_ERR(sdp_dev->pd))
 		goto out_pd;
         sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
         if (IS_ERR(sdp_dev->mr))
 		goto out_mr;
 	memset(&param, 0, sizeof param);
 	param.max_pages_per_fmr = SDP_FMR_SIZE;
 	param.page_shift = PAGE_SHIFT;
 	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
 	param.pool_size = SDP_FMR_POOL_SIZE;
 	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
 	param.cache = 1;
 	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
 	if (IS_ERR(sdp_dev->fmr_pool))
 		goto out_fmr;
 	ib_set_client_data(device, &sdp_client, sdp_dev);
 	return;
 
 	/* Unwind in reverse order of setup. */
 out_fmr:
 	ib_dereg_mr(sdp_dev->mr);
 out_mr:
 	ib_dealloc_pd(sdp_dev->pd);
 out_pd:
 	free(sdp_dev, M_SDP);
 }
 
 /*
  * ib_client "remove" callback: tear down the per-device SDP state.
  *
  * Every socket on sdp_list that is bound to the departing device and is
  * not already marked SDP_DESTROY is notified with ECONNRESET.  The
  * per-device resources (FMR pool, MR, PD) are then released, provided
  * client data was registered for this device.
  */
 static void
 sdp_dev_rem(struct ib_device *device)
 {
 	struct sdp_device *sdp_dev;
 	struct sdp_sock *ssk, *ssk_next, *locked;
 
 	SDP_LIST_WLOCK();
 	/*
 	 * sdp_notify() may hand back NULL, in which case the socket must
 	 * no longer be touched.  Use the _SAFE iterator so that the next
 	 * element is fetched before the body runs, and keep the notify
 	 * result in its own variable instead of overwriting the cursor;
 	 * a plain LIST_FOREACH would evaluate LIST_NEXT() on a NULL cursor.
 	 */
 	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_next) {
 		if (ssk->ib_device != device)
 			continue;
 		SDP_WLOCK(ssk);
 		locked = ssk;
 		if ((ssk->flags & SDP_DESTROY) == 0)
 			locked = sdp_notify(ssk, ECONNRESET);
 		if (locked != NULL)
 			SDP_WUNLOCK(locked);
 	}
 	SDP_LIST_WUNLOCK();
 	/*
 	 * XXX Do I need to wait between these two?
 	 */
 	sdp_dev = ib_get_client_data(device, &sdp_client);
 	if (!sdp_dev)
 		return;
 	ib_flush_fmr_pool(sdp_dev->fmr_pool);
 	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
 	ib_dereg_mr(sdp_dev->mr);
 	ib_dealloc_pd(sdp_dev->pd);
 	free(sdp_dev, M_SDP);
 }
 
 /* IB client descriptor: per-device add/remove callbacks for SDP. */
 struct ib_client sdp_client =
     { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
 
 
 /*
  * Sysctl handler exporting the list of SDP sockets.
  *
  * The output is a struct xinpgen header, one struct xtcpcb per socket the
  * caller is allowed to see, and a trailing struct xinpgen.  A request
  * without an output buffer only reports a size estimate; a request that
  * supplies new data is rejected with EPERM.
  *
  * Returns 0 on success or the error from wiring/copying out the buffer.
  */
 static int
 sdp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, n, i;
 	struct sdp_sock *ssk;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		/* Size probe: pad the count by max(n / 8, 10) entries. */
 		n = sdp_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	/* This node is read-only. */
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	/* Snapshot the socket count; it bounds the records emitted below. */
 	SDP_LIST_RLOCK();
 	n = sdp_count;
 	SDP_LIST_RUNLOCK();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	/* Leading header. */
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = 0;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	SDP_LIST_RLOCK();
 	/* i counts exported records only; skipped sockets do not advance it. */
 	for (ssk = LIST_FIRST(&sdp_list), i = 0;
 	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
 		struct xtcpcb xt;
 
 		SDP_RLOCK(ssk);
 		/* Visibility check; any failure silently skips the socket. */
 		if (ssk->flags & SDP_TIMEWAIT) {
 			if (ssk->cred != NULL)
 				error = cr_cansee(req->td->td_ucred,
 				    ssk->cred);
 			else
 				error = EINVAL;	/* Skip this inp. */
 		} else if (ssk->socket)
 			error = cr_canseesocket(req->td->td_ucred,
 			    ssk->socket);
 		else
 			error = EINVAL;
 		if (error) {
 			error = 0;
 			goto next;
 		}
 
 		/* Build the record; it is labelled as IPv4 / IPPROTO_TCP. */
 		bzero(&xt, sizeof(xt));
 		xt.xt_len = sizeof xt;
 		xt.xt_inp.inp_gencnt = 0;
 		xt.xt_inp.inp_vflag = INP_IPV4;
 		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
 		xt.xt_inp.inp_lport = ssk->lport;
 		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
 		xt.xt_inp.inp_fport = ssk->fport;
 		xt.xt_tp.t_state = ssk->state;
 		if (ssk->socket != NULL)
 			sotoxsocket(ssk->socket, &xt.xt_socket);
 		else
 			bzero(&xt.xt_socket, sizeof xt.xt_socket);
 		xt.xt_socket.xso_protocol = IPPROTO_TCP;
 		/* The socket lock is dropped before copying out. */
 		SDP_RUNLOCK(ssk);
 		error = SYSCTL_OUT(req, &xt, sizeof xt);
 		if (error)
 			break;
 		i++;
 		continue;
 next:
 		SDP_RUNLOCK(ssk);
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = 0;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = sdp_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	SDP_LIST_RUNLOCK();
 	return (error);
 }
 
 /* "sdp" sysctl node under _net_inet, with an automatically assigned OID. */
 static SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");
 
 /* Read-only "pcblist" entry served by sdp_pcblist(). */
 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
     CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
     "List of active SDP connections");
 
 /*
  * maxsockets_change event handler (registered in sdp_init()): re-apply the
  * current maxsockets value as the limit of the SDP socket zone.  The tag
  * argument is unused.
  */
 static void
 sdp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(sdp_zone, maxsockets);
 }
 
 /*
  * Domain initialization hook (sdpdomain.dom_init).
  *
  * Initializes the global socket list, creates the sdp_sock UMA zone capped
  * at maxsockets (and tracks later changes of that limit), creates the
  * single-threaded receive completion workqueue and registers SDP as an IB
  * client.
  */
 static void
 sdp_init(void)
 {
 
 	LIST_INIT(&sdp_list);
 	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(sdp_zone, maxsockets);
 	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 	/* NOTE(review): the workqueue pointer is not checked for NULL. */
 	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
 	ib_register_client(&sdp_client);
 }
 
 extern struct domain sdpdomain;
 
 /* User-request dispatch table shared by both sdpsw[] entries. */
 struct pr_usrreqs sdp_usrreqs = {
 	.pru_abort =		sdp_abort,
 	.pru_accept =		sdp_accept,
 	.pru_attach =		sdp_attach,
 	.pru_bind =		sdp_bind,
 	.pru_connect =		sdp_connect,
 	.pru_control =		sdp_control,
 	.pru_detach =		sdp_detach,
 	.pru_disconnect =	sdp_disconnect,
 	.pru_listen =		sdp_listen,
 	.pru_peeraddr =		sdp_getpeeraddr,
 	.pru_rcvoob =		sdp_rcvoob,
 	.pru_send =		sdp_send,
 	.pru_sosend =		sdp_sosend,
 	.pru_soreceive =	sdp_sorecv,
 	.pru_shutdown =		sdp_shutdown,
 	.pru_sockaddr =		sdp_getsockaddr,
 	.pru_close =		sdp_close,
 };
 
 /*
  * Protocol switch for the SDP domain.  The two entries are identical
  * except for pr_protocol: stream sockets are accepted for both IPPROTO_IP
  * and IPPROTO_TCP.
  */
 struct protosw sdpsw[] = {
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&sdpdomain,
 	.pr_protocol =		IPPROTO_IP,
 	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
 	.pr_ctlinput =		sdp_ctlinput,
 	.pr_ctloutput =		sdp_ctloutput,
 	.pr_usrreqs =		&sdp_usrreqs
 },
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&sdpdomain,
 	.pr_protocol =		IPPROTO_TCP,
 	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
 	.pr_ctlinput =		sdp_ctlinput,
 	.pr_ctloutput =		sdp_ctloutput,
 	.pr_usrreqs =		&sdp_usrreqs
 },
 };
 
 /* Domain descriptor for AF_INET_SDP; sdp_init() is its init hook. */
 struct domain sdpdomain = {
 	.dom_family =		AF_INET_SDP,
 	.dom_name =		"SDP",
 	.dom_init =		sdp_init,
 	.dom_protosw =		sdpsw,
 	/* One-past-the-end pointer of the sdpsw[] table. */
 	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
 };
 
 DOMAIN_SET(sdp);
 
 int sdp_debug_level = 1;
 int sdp_data_debug_level = 0;
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c	(revision 303517)
@@ -1,782 +1,754 @@
 /*
  * Copyright (c) 2009 Mellanox Technologies Ltd.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "sdp.h"
 
 SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024,
 		"Receive buffer initial size in bytes.");
 SDP_MODPARAM_SINT(rcvbuf_scale, 0x8,
 		"Receive buffer size scale factor.");
 
 /*
  * Like tcp_fin - called when SDP_MID_DISCONNECT is received.
  *
  * Must be called with the socket write lock held.  Unless a FIN was
  * already received, the socket is marked as unable to receive more data;
  * the connection state is then advanced according to the current state.
  * In FIN_WAIT_1 the write lock is dropped around rdma_disconnect() and
  * re-acquired afterwards.
  */
 static void
 sdp_handle_disconn(struct sdp_sock *ssk)
 {
 
 	sdp_dbg(ssk->socket, "%s\n", __func__);
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (TCPS_HAVERCVDFIN(ssk->state) == 0)
 		socantrcvmore(ssk->socket);
 
 	switch (ssk->state) {
 	case TCPS_SYN_RECEIVED:
 	case TCPS_ESTABLISHED:
 		/* Peer closed first. */
 		ssk->state = TCPS_CLOSE_WAIT;
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		/* Received a reply FIN - start Infiniband tear down */
 		sdp_dbg(ssk->socket,
 		    "%s: Starting Infiniband tear down sending DREQ\n",
 		    __func__);
 
 		sdp_cancel_dreq_wait_timeout(ssk);
 		ssk->qp_active = 0;
 		if (ssk->id) {
 			struct rdma_cm_id *id;
 
 			/* Cache the id: the lock is released for the call. */
 			id = ssk->id;
 			SDP_WUNLOCK(ssk);
 			rdma_disconnect(id);
 			SDP_WLOCK(ssk);
 		} else {
 			sdp_warn(ssk->socket,
 			    "%s: ssk->id is NULL\n", __func__);
 			return;
 		}
 		break;
 	case TCPS_TIME_WAIT:
 		/* This is a mutual close situation and we've got the DREQ from
 		   the peer before the SDP_MID_DISCONNECT */
 		break;
 	case TCPS_CLOSED:
 		/* FIN arrived after IB teardown started - do nothing */
 		sdp_dbg(ssk->socket, "%s: fin in state %s\n",
 		    __func__, sdp_state_str(ssk->state));
 		return;
 	default:
 		sdp_warn(ssk->socket,
 		    "%s: FIN in unexpected state. state=%d\n",
 		    __func__, ssk->state);
 		break;
 	}
 }
 
 /*
  * Allocate one receive buffer, DMA-map it and post it to the QP.
  *
  * An mbuf chain sized for ssk->recv_bytes is allocated without sleeping,
  * every mbuf of the chain gets its own scatter/gather entry, and the work
  * request is tagged with the current ring head.  The ring head is only
  * advanced after the post succeeded.
  *
  * Returns 0 on success, -1 if the allocation or the post failed.  When the
  * post fails the buffer is unmapped and freed and the connection is reset
  * through sdp_notify().
  */
 static int
 sdp_post_recv(struct sdp_sock *ssk)
 {
 	struct sdp_buf *rx_req;
 	int i, rc;
 	u64 addr;
 	struct ib_device *dev;
 	struct ib_recv_wr rx_wr = { NULL };
 	struct ib_sge ibsge[SDP_MAX_RECV_SGES];
 	struct ib_sge *sge = ibsge;
 	struct ib_recv_wr *bad_wr;
 	struct mbuf *mb, *m;
 	int id = ring_head(ssk->rx_ring);
 
 	/* Now, allocate and repost recv */
 	/* No mbuf exists yet, so none is handed to the profiling hook. */
 	sdp_prf(ssk->socket, NULL, "Posting mb");
 	mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (mb == NULL) {
 		/* Retry so we can't stall out with no memory. */
 		if (!rx_ring_posted(ssk))
 			queue_work(rx_comp_wq, &ssk->rx_comp_work);
 		return -1;
 	}
 	/* Expose the full capacity of every mbuf to the hardware. */
 	for (m = mb; m != NULL; m = m->m_next) {
 		m->m_len = M_SIZE(m);
 		mb->m_pkthdr.len += m->m_len;
 	}
 	rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1));
 	rx_req->mb = mb;
 	dev = ssk->ib_device;
 	/*
 	 * Walk the chain with a cursor so that mb keeps pointing at the
 	 * head; the error path below has to free the whole chain.
 	 *
 	 * NOTE(review): the buffer is mapped DMA_TO_DEVICE here but unmapped
 	 * with DMA_FROM_DEVICE - confirm which direction is intended.
 	 */
 	for (i = 0, m = mb; m != NULL; i++, m = m->m_next, sge++) {
 		addr = ib_dma_map_single(dev, m->m_data, m->m_len,
 		    DMA_TO_DEVICE);
 		/* TODO: proper error handling */
 		BUG_ON(ib_dma_mapping_error(dev, addr));
 		BUG_ON(i >= SDP_MAX_RECV_SGES);
 		rx_req->mapping[i] = addr;
 		sge->addr = addr;
 		sge->length = m->m_len;
 		sge->lkey = ssk->sdp_dev->mr->lkey;
 	}
 
 	rx_wr.next = NULL;
 	rx_wr.wr_id = id | SDP_OP_RECV;
 	rx_wr.sg_list = ibsge;
 	rx_wr.num_sge = i;
 	rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr);
 	if (unlikely(rc)) {
 		sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc);
 
 		/* Unmap first (needs rx_req->mb), then release the chain. */
 		sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
 		rx_req->mb = NULL;
 		m_freem(mb);
 
 		/*
 		 * NOTE(review): the return value of sdp_notify() is ignored
 		 * here - confirm ssk is still valid for the callers.
 		 */
 		sdp_notify(ssk, ECONNRESET);
 
 		return -1;
 	}
 
 	atomic_inc(&ssk->rx_ring.head);
 	SDPSTATS_COUNTER_INC(post_recv);
 
 	return 0;
 }
 
 /*
  * Decide whether another receive buffer should be posted.
  *
  * Returns 0 when the QP is inactive, the socket is gone or the ring is
  * full; 1 when fewer than SDP_MIN_TX_CREDITS buffers are posted.
  * Otherwise the bytes already committed (posted buffers beyond the
  * minimum plus data queued in so_rcv) are compared against a limit
  * derived from the receive high-water mark scaled by rcvbuf_scale.
  */
 static inline int
 sdp_post_recvs_needed(struct sdp_sock *ssk)
 {
 	unsigned long bytes_in_process;
 	unsigned long max_bytes;
 	int buffer_size;
 	int posted;
 
 	if (!ssk->qp_active || !ssk->socket)
 		return 0;
 
 	posted = rx_ring_posted(ssk);
 	if (posted >= SDP_RX_SIZE)
 		return 0;
 	if (posted < SDP_MIN_TX_CREDITS)
 		return 1;
 
 	buffer_size = ssk->recv_bytes;
-	max_bytes = max(ssk->socket->so_snd.sb_hiwat,
+	max_bytes = max(ssk->socket->so_rcv.sb_hiwat,
 	    (1 + SDP_MIN_TX_CREDITS) * buffer_size);
 	max_bytes *= rcvbuf_scale;
 	/*
 	 * Compute bytes in the receive queue and socket buffer.
 	 */
 	bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size;
 	bytes_in_process += sbused(&ssk->socket->so_rcv);
 
 	return bytes_in_process < max_bytes;
 }
 
 /*
  * Keep posting receive buffers until no more are needed or a post
  * attempt fails.
  */
 static inline void
 sdp_post_recvs(struct sdp_sock *ssk)
 {
 
 	for (;;) {
 		if (!sdp_post_recvs_needed(ssk))
 			break;
 		if (sdp_post_recv(ssk) != 0)
 			break;
 	}
 }
 
 /*
  * Queue a received data packet on the socket receive buffer.
  *
  * rcv_nxt is advanced by the packet length, the SDP header is stripped
  * and the remainder is appended to so_rcv, after which readers are woken
  * up.  Returns the mbuf that was queued.
  */
 static inline struct mbuf *
 sdp_sock_queue_rcv_mb(struct socket *sk, struct mbuf *mb)
 {
 	struct sdp_sock *ssk = sdp_sk(sk);
 	struct sdp_bsdh *h;
 
 	/* Taken before m_adj(); still used for the flags check below. */
 	h = mtod(mb, struct sdp_bsdh *);
 
 	/*
 	 * NOTE(review): the SDP_ZCOPY section uses names that are not
 	 * declared in this function (mb_len, mb->len, sk->sk_sleep) -
 	 * presumably it is never compiled here; confirm.
 	 */
 #ifdef SDP_ZCOPY
 	SDP_SKB_CB(mb)->seq = rcv_nxt(ssk);
 	if (h->mid == SDP_MID_SRCAVAIL) {
 		struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1);
 		struct rx_srcavail_state *rx_sa;
 		
 		ssk->srcavail_cancel_mseq = 0;
 
 		ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(mb) = kzalloc(
 				sizeof(struct rx_srcavail_state), M_NOWAIT);
 
 		rx_sa->mseq = ntohl(h->mseq);
 		rx_sa->used = 0;
 		rx_sa->len = mb_len = ntohl(srcah->len);
 		rx_sa->rkey = ntohl(srcah->rkey);
 		rx_sa->vaddr = be64_to_cpu(srcah->vaddr);
 		rx_sa->flags = 0;
 
 		if (ssk->tx_sa) {
 			sdp_dbg_data(ssk->socket, "got RX SrcAvail while waiting "
 					"for TX SrcAvail. waking up TX SrcAvail"
 					"to be aborted\n");
 			wake_up(sk->sk_sleep);
 		}
 
 		atomic_add(mb->len, &ssk->rcv_nxt);
 		sdp_dbg_data(sk, "queueing SrcAvail. mb_len = %d vaddr = %lld\n",
 			mb_len, rx_sa->vaddr);
 	} else
 #endif
 	{
 		/* Counted before the header is stripped below. */
 		atomic_add(mb->m_pkthdr.len, &ssk->rcv_nxt);
 	}
 
 	m_adj(mb, SDP_HEAD_SIZE);
 	SOCKBUF_LOCK(&sk->so_rcv);
 	if (unlikely(h->flags & SDP_OOB_PRES))
 		sdp_urg(ssk, mb);
 	sbappend_locked(&sk->so_rcv, mb, 0);
 	/*
 	 * There is no explicit unlock here; presumably sorwakeup_locked()
 	 * releases the so_rcv lock - confirm.
 	 */
 	sorwakeup_locked(sk);
 	return mb;
 }
 
 /*
  * Clamp a requested receive buffer size to SDP_MAX_PACKET.  The ssk
  * argument is not used.
  */
 static int
 sdp_get_recv_bytes(struct sdp_sock *ssk, u32 new_size)
 {
 
 	if (new_size < SDP_MAX_PACKET)
 		return new_size;
 	return SDP_MAX_PACKET;
 }
 
 /*
  * Set the initial receive buffer size (clamped by sdp_get_recv_bytes())
  * and post receive buffers.  Always returns 0.
  */
 int
 sdp_init_buffers(struct sdp_sock *ssk, u32 new_size)
 {
 
 	ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
 	sdp_post_recvs(ssk);
 
 	return 0;
 }
 
 /*
  * Grow the receive buffer size.  The request is honoured only when it is
  * larger than the current size and does not exceed SDP_MAX_PACKET.
  * Returns 0 when the size was changed and -1 otherwise.
  */
 int
 sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size)
 {
 	u32 limit = SDP_MAX_PACKET;
 	u32 current_bytes = ssk->recv_bytes;
 
 	/* Reject anything that is not a growth within the limit. */
 	if (new_size <= current_bytes || new_size > limit)
 		return -1;
 
 	ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
 	return 0;
 }
 
 /*
  * Handle a peer request to change the receive buffer size.  The ring
  * position recorded in recv_request_head depends on whether the resize
  * was accepted; recv_request is raised in both cases.
  */
 static void
 sdp_handle_resize_request(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
 {
 	if (sdp_resize_buffers(ssk, ntohl(buf->size)) != 0) {
 		/* Resize refused. */
 		ssk->recv_request_head = ring_tail(ssk->rx_ring);
 	} else {
 		/* Resize accepted. */
 		ssk->recv_request_head = ring_head(ssk->rx_ring) + 1;
 	}
 	ssk->recv_request = 1;
 }
 
 /*
  * Handle the peer's acknowledgement of a buffer resize: raise the
  * transmit size goal if the acknowledged size is larger than it.
  */
 static void
 sdp_handle_resize_ack(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
 {
 	u32 acked_size;
 
 	acked_size = ntohl(buf->size);
 	if (ssk->xmit_size_goal < acked_size)
 		ssk->xmit_size_goal = acked_size;
 }
 
 /*
  * Retire the receive ring slot for a completed work request.
  *
  * The id must match the current ring tail; otherwise a warning is printed
  * and NULL is returned.  On success the buffer is unmapped, the ring tail
  * is advanced, remote_credits is decremented and the mbuf chain is
  * returned to the caller, who becomes responsible for it.
  */
 static struct mbuf *
 sdp_recv_completion(struct sdp_sock *ssk, int id)
 {
 	struct sdp_buf *rx_req;
 	struct ib_device *dev;
 	struct mbuf *mb;
 
 	if (unlikely(id != ring_tail(ssk->rx_ring))) {
 		printk(KERN_WARNING "Bogus recv completion id %d tail %d\n",
 			id, ring_tail(ssk->rx_ring));
 		return NULL;
 	}
 
 	dev = ssk->ib_device;
 	rx_req = &ssk->rx_ring.buffer[id & (SDP_RX_SIZE - 1)];
 	mb = rx_req->mb;
 	sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
 
 	atomic_inc(&ssk->rx_ring.tail);
 	atomic_dec(&ssk->remote_credits);
 	return mb;
 }
 
 /*
  * Process one queued control message.  The socket write lock must be held
  * (asserted below).  The message is dispatched on its SDP message id.
  *
  * NOTE(review): the return value of sdp_notify() is ignored in this
  * function - confirm ssk remains valid afterwards.
  */
-/* socket lock should be taken before calling this */
-static int
+static void
 sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb)
 {
 	struct sdp_bsdh *h;
 	struct socket *sk;
 
 	SDP_WLOCK_ASSERT(ssk);
+
 	sk = ssk->socket;
  	h = mtod(mb, struct sdp_bsdh *);
 	switch (h->mid) {
 	case SDP_MID_DATA:
 	case SDP_MID_SRCAVAIL:
 		sdp_dbg(sk, "DATA after socket rcv was shutdown\n");
 
 		/* got data in RCV_SHUTDOWN */
 		if (ssk->state == TCPS_FIN_WAIT_1) {
 			sdp_dbg(sk, "RX data when state = FIN_WAIT1\n");
 			sdp_notify(ssk, ECONNRESET);
 		}
-		m_freem(mb);
 
 		break;
 #ifdef SDP_ZCOPY
 	case SDP_MID_RDMARDCOMPL:
-		m_freem(mb);
 		break;
 	case SDP_MID_SENDSM:
 		sdp_handle_sendsm(ssk, ntohl(h->mseq_ack));
-		m_freem(mb);
 		break;
 	case SDP_MID_SRCAVAIL_CANCEL:
 		sdp_dbg_data(sk, "Handling SrcAvailCancel\n");
 		sdp_prf(sk, NULL, "Handling SrcAvailCancel");
 		if (ssk->rx_sa) {
 			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
 			ssk->rx_sa->flags |= RX_SA_ABORTED;
 			ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get 
 			                      the dirty logic from recvmsg */
 		} else {
 			sdp_dbg(sk, "Got SrcAvailCancel - "
 					"but no SrcAvail in process\n");
 		}
-		m_freem(mb);
 		break;
 	case SDP_MID_SINKAVAIL:
 		sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n");
 		sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored");
 		/* FALLTHROUGH */
 #endif
 	case SDP_MID_ABORT:
 		sdp_dbg_data(sk, "Handling ABORT\n");
 		sdp_prf(sk, NULL, "Handling ABORT");
 		sdp_notify(ssk, ECONNRESET);
-		m_freem(mb);
 		break;
 	case SDP_MID_DISCONN:
 		sdp_dbg_data(sk, "Handling DISCONN\n");
 		sdp_prf(sk, NULL, "Handling DISCONN");
 		sdp_handle_disconn(ssk);
 		break;
 	case SDP_MID_CHRCVBUF:
 		sdp_dbg_data(sk, "Handling RX CHRCVBUF\n");
 		sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1));
-		m_freem(mb);
 		break;
 	case SDP_MID_CHRCVBUF_ACK:
 		sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n");
 		sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1));
-		m_freem(mb);
 		break;
 	default:
 		/* TODO: Handle other messages */
 		sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid);
-		m_freem(mb);
+		break;
 	}
-
-	return 0;
+	m_freem(mb);
 }
 
 /*
  * First-stage processing of a received packet.
  *
  * Transmit credits are refreshed from the header's mseq_ack/bufs fields.
  * A header-only SDP_MID_DATA packet (pure credit update) is freed right
  * away.  Control messages, and any packet arriving after a FIN was
  * received, are queued for sdp_do_posts(); everything else is appended to
  * the socket receive buffer.  Always returns 0.
  */
 static int
 sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb)
 {
 	struct socket *sk;
 	struct sdp_bsdh *h;
 	unsigned long mseq_ack;
 	int credits_before;
 
 	h = mtod(mb, struct sdp_bsdh *);
 	sk = ssk->socket;
 	/*
 	 * If another thread is in so_pcbfree this may be partially torn
 	 * down but no further synchronization is required as the destroying
 	 * thread will wait for receive to shutdown before discarding the
 	 * socket.
 	 */
 	if (sk == NULL) {
 		m_freem(mb);
 		return 0;
 	}
 
 	SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk));
 
 	/* Recompute the available transmit credits from the peer's ack. */
 	mseq_ack = ntohl(h->mseq_ack);
 	credits_before = tx_credits(ssk);
 	atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) +
 			1 + ntohs(h->bufs));
 	if (mseq_ack >= ssk->nagle_last_unacked)
 		ssk->nagle_last_unacked = 0;
 
 	sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n",
 		mid2str(h->mid), ntohs(h->bufs), credits_before,
 		tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack));
 
 	if (unlikely(h->mid == SDP_MID_DATA &&
 	    mb->m_pkthdr.len == SDP_HEAD_SIZE)) {
 		/* Credit update is valid even after RCV_SHUTDOWN */
 		m_freem(mb);
 		return 0;
 	}
 
 	if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) ||
 	    TCPS_HAVERCVDFIN(ssk->state)) {
 		sdp_prf(sk, NULL, "Control mb - queing to control queue");
 #ifdef SDP_ZCOPY
 		if (h->mid == SDP_MID_SRCAVAIL_CANCEL) {
 			sdp_dbg_data(sk, "Got SrcAvailCancel. "
 					"seq: 0x%d seq_ack: 0x%d\n",
 					ntohl(h->mseq), ntohl(h->mseq_ack));
 			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
 		}
 
 
 		if (h->mid == SDP_MID_RDMARDCOMPL) {
 			struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1);
 			sdp_dbg_data(sk, "RdmaRdCompl message arrived\n");
 			sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack),
 					ntohl(rrch->len));
 		}
 #endif
-		mb->m_nextpkt = NULL;
-		if (ssk->rx_ctl_tail)
-			ssk->rx_ctl_tail->m_nextpkt = mb;
-		else
-			ssk->rx_ctl_q = mb;
-		ssk->rx_ctl_tail = mb;
-
-		return 0;
+		if (mbufq_enqueue(&ssk->rxctlq, mb) != 0)
+			m_freem(mb);
+		return (0);
 	}
 
 	sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid));
 	mb = sdp_sock_queue_rcv_mb(sk, mb);
 
 
 	return 0;
 }
 
 /*
  * called only from irq
  *
  * Turn one receive work completion into an mbuf chain.  Returns NULL when
  * the completion id is bogus, the completion carries an error status
  * (the socket is aborted if the QP was still active) or the packet is
  * shorter than an SDP header; in the latter two cases the buffer is
  * freed here.  Otherwise the chain is trimmed to the received length,
  * the receive counters and mseq_ack are updated and the chain is returned.
  */
 static struct mbuf *
 sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
 {
 	struct mbuf *mb;
 	struct sdp_bsdh *h;
 	struct socket *sk = ssk->socket;
 	int mseq;
 
 	mb = sdp_recv_completion(ssk, wc->wr_id);
 	if (unlikely(!mb))
 		return NULL;
 
 	if (unlikely(wc->status)) {
 		if (ssk->qp_active && sk) {
 			sdp_dbg(sk, "Recv completion with error. "
 					"Status %d, vendor: %d\n",
 				wc->status, wc->vendor_err);
 			sdp_abort(sk);
 			ssk->qp_active = 0;
 		}
 		m_freem(mb);
 		return NULL;
 	}
 
 	sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n",
 			(int)wc->wr_id, wc->byte_len);
 	if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) {
 		sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n",
 				wc->byte_len, sizeof(struct sdp_bsdh));
 		m_freem(mb);
 		return NULL;
 	}
 	/* Use m_adj to trim the tail of data we didn't use. */
 	m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len));
 	h = mtod(mb, struct sdp_bsdh *);
 
 	SDP_DUMP_PACKET(ssk->socket, "RX", mb, h);
 
 	ssk->rx_packets++;
 	ssk->rx_bytes += mb->m_pkthdr.len;
 
 	/* Remember the last sequence number seen; a mismatch only warns. */
 	mseq = ntohl(h->mseq);
 	atomic_set(&ssk->mseq_ack, mseq);
 	if (mseq != (int)wc->wr_id)
 		sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n",
 				mseq, (int)wc->wr_id);
 
 	return mb;
 }
 
 /*
  * Wake up writers once at least min_bufs transmit credits are available
  * and the socket still exists.
  */
 static void
 sdp_bzcopy_write_space(struct sdp_sock *ssk)
 {
 	struct socket *so = ssk->socket;
 
 	if (tx_credits(ssk) < ssk->min_bufs)
 		return;
 	if (!so)
 		return;
 	sowwakeup(so);
 }
 
 /*
  * only from interrupt.
  *
  * Drain the receive CQ in batches of SDP_NUM_WC completions, feeding each
  * usable completion through sdp_process_rx_wc() and sdp_process_rx_mb().
  * Polling stops once a batch comes back not full.  Writers are woken up
  * if anything was processed.  Returns the number of completions that
  * yielded a packet.
  */
 static int
 sdp_poll_rx_cq(struct sdp_sock *ssk)
 {
 	struct ib_cq *cq = ssk->rx_ring.cq;
 	struct ib_wc ibwc[SDP_NUM_WC];
 	int n, i;
 	int wc_processed = 0;
 	struct mbuf *mb;
 
 	do {
 		n = ib_poll_cq(cq, SDP_NUM_WC, ibwc);
 		for (i = 0; i < n; ++i) {
 			struct ib_wc *wc = &ibwc[i];
 
 			BUG_ON(!(wc->wr_id & SDP_OP_RECV));
 			mb = sdp_process_rx_wc(ssk, wc);
 			/* Rejected completions are not counted. */
 			if (!mb)
 				continue;
 
 			sdp_process_rx_mb(ssk, mb);
 			wc_processed++;
 		}
 	} while (n == SDP_NUM_WC);
 
 	if (wc_processed)
 		sdp_bzcopy_write_space(ssk);
 
 	return wc_processed;
 }
 
 /*
  * Deferred receive completion work (runs from rx_comp_wq).
  *
  * Takes the socket write lock and, provided the QP and receive CQ still
  * exist, calls sdp_do_posts().  If CQ polling is not enabled yet
  * (poll_cq is clear), the connection manager is notified with
  * IB_EVENT_COMM_EST instead.
  */
 static void
 sdp_rx_comp_work(struct work_struct *work)
 {
 	struct sdp_sock *ssk = container_of(work, struct sdp_sock,
 			rx_comp_work);
 
 	sdp_prf(ssk->socket, NULL, "%s", __func__);
 
 	SDP_WLOCK(ssk);
 	if (unlikely(!ssk->qp)) {
 		sdp_prf(ssk->socket, NULL, "qp was destroyed");
 		goto out;
 	}
 	if (unlikely(!ssk->rx_ring.cq)) {
 		sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL");
 		goto out;
 	}
 
 	if (unlikely(!ssk->poll_cq)) {
 		struct rdma_cm_id *id = ssk->id;
 		if (id && id->qp)
 			rdma_notify(id, IB_EVENT_COMM_EST);
 		goto out;
 	}
 
 	sdp_do_posts(ssk);
 out:
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Do the work that needs the socket write lock (asserted below): process
  * queued control messages, repost receive buffers, reap transmit
  * completions and post pending sends.  Nothing is done when the QP is
  * inactive; after the control queue was processed, the rest is skipped
  * in TIME_WAIT or when either CQ is missing.
  */
 void
 sdp_do_posts(struct sdp_sock *ssk)
 {
 	struct socket *sk = ssk->socket;
 	int xmit_poll_force;
 	struct mbuf *mb;
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (!ssk->qp_active) {
 		sdp_dbg(sk, "QP is deactivated\n");
 		return;
 	}
 
-	while ((mb = ssk->rx_ctl_q)) {
-		ssk->rx_ctl_q = mb->m_nextpkt;
-		mb->m_nextpkt = NULL;
+	while ((mb = mbufq_dequeue(&ssk->rxctlq)) != NULL)
 		sdp_process_rx_ctl_mb(ssk, mb);
-	}
 
 	if (ssk->state == TCPS_TIME_WAIT)
 		return;
 
 	if (!ssk->rx_ring.cq || !ssk->tx_ring.cq)
 		return;
 
 	sdp_post_recvs(ssk);
 
 	if (tx_ring_posted(ssk))
 		sdp_xmit_poll(ssk, 1);
 
 	sdp_post_sends(ssk, M_NOWAIT);
 
 	/* Force a transmit poll when we are short on transmit credits. */
 	xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS;
 
 	if (credit_update_needed(ssk) || xmit_poll_force) {
 		/* if has pending tx because run out of tx_credits - xmit it */
 		sdp_prf(sk, NULL, "Processing to free pending sends");
 		sdp_xmit_poll(ssk,  xmit_poll_force);
 		sdp_prf(sk, NULL, "Sending credit update");
 		sdp_post_sends(ssk, M_NOWAIT);
 	}
 
 }
 
 /*
  * Poll the receive CQ on behalf of the interrupt handler.
  *
  * If the ring lock cannot be taken (ring being destroyed) nothing is
  * polled and 0 is returned.  Otherwise the CQ is drained, the deferred
  * completion work is queued when anything was processed, and the CQ is
  * re-armed.  Returns the number of completions processed.
  */
 int
 sdp_process_rx(struct sdp_sock *ssk)
 {
 	int wc_processed = 0;
 	int credits_before;
 
 	if (!rx_ring_trylock(&ssk->rx_ring)) {
 		sdp_dbg(ssk->socket, "ring destroyed. not polling it\n");
 		return 0;
 	}
 
 	credits_before = tx_credits(ssk);
 
 	wc_processed = sdp_poll_rx_cq(ssk);
 	sdp_prf(ssk->socket, NULL, "processed %d", wc_processed);
 
 	if (wc_processed) {
 		sdp_prf(ssk->socket, NULL, "credits:  %d -> %d",
 				credits_before, tx_credits(ssk));
 		/* Lock-requiring work is deferred to sdp_rx_comp_work(). */
 		queue_work(rx_comp_wq, &ssk->rx_comp_work);
 	}
 	sdp_arm_rx_cq(ssk);
 
 	rx_ring_unlock(&ssk->rx_ring);
 
 	return (wc_processed);
 }
 
 /*
  * Receive CQ completion callback.  Counts the interrupt and polls the
  * receive CQ via sdp_process_rx().
  *
  * The trace call names the socket through ssk->socket, which is valid
  * with either form of the local declarations, rather than through a
  * separate "sk" local.
  */
 static void
 sdp_rx_irq(struct ib_cq *cq, void *cq_context)
 {
-	struct socket *sk = cq_context;
-	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sdp_sock *ssk;
 
-	if (cq != ssk->rx_ring.cq) {
-		sdp_dbg(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq);
-		return;
-	}
+	ssk = cq_context;
+	KASSERT(cq == ssk->rx_ring.cq,
+	    ("%s: mismatched cq on %p", __func__, ssk));
 
 	SDPSTATS_COUNTER_INC(rx_int_count);
 
 	sdp_prf(ssk->socket, NULL, "rx irq");
 
 	sdp_process_rx(ssk);
 }
 
 static
 void sdp_rx_ring_purge(struct sdp_sock *ssk)
 {
 	while (rx_ring_posted(ssk) > 0) {
 		struct mbuf *mb;
 		mb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring));
 		if (!mb)
 			break;
 		m_freem(mb);
 	}
 }
 
 /*
  * Put the receive ring into its initial, empty state: no buffer array,
  * not destroyed, and the "destroyed" lock initialized.
  */
 void
 sdp_rx_ring_init(struct sdp_sock *ssk)
 {
 	ssk->rx_ring.buffer = NULL;
 	ssk->rx_ring.destroyed = 0;
 	rw_init(&ssk->rx_ring.destroyed_lock, "sdp rx lock");
 }
 
 /* Asynchronous event callback for the receive CQ; intentionally empty. */
 static void
 sdp_rx_cq_event_handler(struct ib_event *event, void *data)
 {
 }
 
 /*
  * Create the receive ring for a socket: initialize the completion work
  * item, reset head and tail to 1, allocate the SDP_RX_SIZE-entry buffer
  * array and create and arm the receive CQ.
  *
  * Returns 0 on success.  If the CQ cannot be created, the buffer array is
  * freed again and the error from ib_create_cq() is returned.
  */
 int
 sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
 {
 	struct ib_cq *rx_cq;
 	int rc = 0;
 
-
 	sdp_dbg(ssk->socket, "rx ring created");
 	INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
 	atomic_set(&ssk->rx_ring.head, 1);
 	atomic_set(&ssk->rx_ring.tail, 1);
 
-	ssk->rx_ring.buffer = kmalloc(
-			sizeof *ssk->rx_ring.buffer * SDP_RX_SIZE, GFP_KERNEL);
-	if (!ssk->rx_ring.buffer) {
-		sdp_warn(ssk->socket,
-			"Unable to allocate RX Ring size %zd.\n",
-			 sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE);
+	ssk->rx_ring.buffer = malloc(sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE,
+	    M_SDP, M_WAITOK);
 
-		return -ENOMEM;
-	}
-
 	rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler,
-			  ssk->socket, SDP_RX_SIZE, 0);
-
+	    ssk, SDP_RX_SIZE, 0);
 	if (IS_ERR(rx_cq)) {
 		rc = PTR_ERR(rx_cq);
 		sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc);
 		goto err_cq;
 	}
 
 	/* NOTE(review): presumably the same object as ssk - confirm. */
 	sdp_sk(ssk->socket)->rx_ring.cq = rx_cq;
 	sdp_arm_rx_cq(ssk);
 
 	return 0;
 
 err_cq:
-	kfree(ssk->rx_ring.buffer);
+	free(ssk->rx_ring.buffer, M_SDP);
 	ssk->rx_ring.buffer = NULL;
 	return rc;
 }
 
 /*
  * Tear down the receive ring: wait for pending completion work, take the
  * ring's destroy lock, free all posted buffers and the buffer array, and
  * destroy the receive CQ.  A warning is issued if head and tail still
  * differ afterwards.
  */
 void
 sdp_rx_ring_destroy(struct sdp_sock *ssk)
 {
 
 	cancel_work_sync(&ssk->rx_comp_work);
 	rx_ring_destroy_lock(&ssk->rx_ring);
 
 	if (ssk->rx_ring.buffer) {
 		sdp_rx_ring_purge(ssk);
-
-		kfree(ssk->rx_ring.buffer);
+		free(ssk->rx_ring.buffer, M_SDP);
 		ssk->rx_ring.buffer = NULL;
 	}
 
 	if (ssk->rx_ring.cq) {
 		/* The CQ pointer is only cleared if the destroy succeeded. */
 		if (ib_destroy_cq(ssk->rx_ring.cq)) {
 			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
 				ssk->rx_ring.cq);
 		} else {
 			ssk->rx_ring.cq = NULL;
 		}
 	}
 
 	WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring));
 }
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c	(revision 303517)
@@ -1,490 +1,480 @@
 /*
  * Copyright (c) 2009 Mellanox Technologies Ltd.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "sdp.h"
 
 /* Increment a counter; wrapped in do/while so it is usable as a statement. */
 #define sdp_cnt(var) do { (var)++; } while (0)
 
 /* Counter bumped via sdp_cnt() by the keepalive paths in this file. */
 SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0,
 		"Total number of keepalive probes sent.");
 
 /* Forward declarations for helpers defined later in this file. */
 static int sdp_process_tx_cq(struct sdp_sock *ssk);
 static void sdp_poll_tx_timeout(void *data);
 
 /*
  * Transmit-side polling hook.  The socket write lock must be held (checked
  * by SDP_WLOCK_ASSERT).
  *
  * Makes sure the tx ring callout is scheduled, then polls the TX completion
  * queue either when 'force' is non-zero or when the incremented poll counter
  * has all of its (SDP_TX_POLL_MODER - 1) bits clear.  Returns whatever
  * sdp_process_tx_cq() reports, or 0 when no poll took place.
  */
 int
 sdp_xmit_poll(struct sdp_sock *ssk, int force)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 	sdp_prf(ssk->socket, NULL, "%s", __func__);
 
 	/*
 	 * Keep a timer outstanding so that the most recent post is still
 	 * reaped if the interface goes idle.
 	 */
 	if (!callout_pending(&ssk->tx_ring.timer)) {
 		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
 		    sdp_poll_tx_timeout, ssk);
 	}
 
 	/* A forced poll leaves the moderation counter untouched. */
 	if (force)
 		return sdp_process_tx_cq(ssk);
 
 	/* Otherwise poll the CQ only every SDP_TX_POLL_MODER packets. */
 	if ((++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) != 0)
 		return 0;
 
 	return sdp_process_tx_cq(ssk);
 }
 
 /*
  * Post one mbuf chain on the send queue as a single signaled IB_WR_SEND.
  *
  * The BSDH header at the front of 'mb' is completed here (flags, bufs, len,
  * mseq, mseq_ack), every mbuf of the chain is DMA-mapped into its own SGE
  * and the request is stored in the tx ring slot selected by the current
  * ring head.  On success the ring head is advanced, one tx credit is
  * consumed and remote_credits is refreshed from rx_ring_posted().
  *
  * 'mb' is consumed on every path: it is freed immediately when the QP is
  * inactive, when a SrcAvail was cancelled (SDP_ZCOPY only) or when
  * ib_post_send() fails; in the last case ECONNRESET is also reported through
  * sdp_notify().
  */
 void
 sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb)
 {
 	struct sdp_buf *tx_req;
 	struct sdp_bsdh *h;
 	unsigned long mseq;
 	struct ib_device *dev;
 	struct ib_send_wr *bad_wr;
 	struct ib_sge ibsge[SDP_MAX_SEND_SGES];
 	struct ib_sge *sge;
 	struct ib_send_wr tx_wr = { NULL };
 	int i, rc;
 	u64 addr;
 
 	/*
 	 * Fetch the header first: the statistics macros below reference
 	 * h->mid and must not see an uninitialized pointer.
 	 */
 	h = mtod(mb, struct sdp_bsdh *);
 
 	SDPSTATS_COUNTER_MID_INC(post_send, h->mid);
 	SDPSTATS_HIST(send_size, mb->m_pkthdr.len);
 
 	if (!ssk->qp_active) {
 		m_freem(mb);
 		return;
 	}
 
 	mseq = ring_head(ssk->tx_ring);
 	ssk->tx_packets++;
 	ssk->tx_bytes += mb->m_pkthdr.len;
 
 #ifdef SDP_ZCOPY
 	if (unlikely(h->mid == SDP_MID_SRCAVAIL)) {
 		struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb);
 		if (ssk->tx_sa != tx_sa) {
 			sdp_dbg_data(ssk->socket, "SrcAvail cancelled "
 					"before being sent!\n");
 			WARN_ON(1);
 			m_freem(mb);
 			return;
 		}
 		TX_SRCAVAIL_STATE(mb)->mseq = mseq;
 	}
 #endif
 
 	if (unlikely(mb->m_flags & M_URG))
 		h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
 	else
 		h->flags = 0;
 
 	mb->m_flags |= M_RDONLY; /* Don't allow compression once sent. */
 	h->bufs = htons(rx_ring_posted(ssk));
 	h->len = htonl(mb->m_pkthdr.len);
 	h->mseq = htonl(mseq);
 	h->mseq_ack = htonl(mseq_ack(ssk));
 
 	sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d",
 			mid2str(h->mid), rx_ring_posted(ssk), mseq,
 			ntohl(h->mseq_ack));
 
 	SDP_DUMP_PACKET(ssk->socket, "TX", mb, h);
 
 	/* The ring slot keeps the chain head; 'mb' is used as cursor below. */
 	tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)];
 	tx_req->mb = mb;
 	dev = ssk->ib_device;
 	sge = &ibsge[0];
 	for (i = 0;  mb != NULL; i++, mb = mb->m_next, sge++) {
 		addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
 		    DMA_TO_DEVICE);
 		/* TODO: proper error handling */
 		BUG_ON(ib_dma_mapping_error(dev, addr));
 		BUG_ON(i >= SDP_MAX_SEND_SGES);
 		tx_req->mapping[i] = addr;
 		sge->addr = addr;
 		sge->length = mb->m_len;
 		sge->lkey = ssk->sdp_dev->mr->lkey;
 	}
 	tx_wr.next = NULL;
 	tx_wr.wr_id = mseq | SDP_OP_SEND;
 	tx_wr.sg_list = ibsge;
 	tx_wr.num_sge = i;
 	tx_wr.opcode = IB_WR_SEND;
 	tx_wr.send_flags = IB_SEND_SIGNALED;
 	if (unlikely(tx_req->mb->m_flags & M_URG))
 		tx_wr.send_flags |= IB_SEND_SOLICITED;
 
 	rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr);
 	if (unlikely(rc)) {
 		sdp_dbg(ssk->socket,
 				"ib_post_send failed with status %d.\n", rc);
 
 		sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);
 
 		sdp_notify(ssk, ECONNRESET);
 		m_freem(tx_req->mb);
 		return;
 	}
 
 	atomic_inc(&ssk->tx_ring.head);
 	atomic_dec(&ssk->tx_ring.credits);
 	atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
 
 	return;
 }
 
 /*
  * Retire the tx ring entry for sequence number 'mseq'.
  *
  * Only the entry at the ring tail may complete; any other 'mseq' is logged
  * as bogus and NULL is returned.  Otherwise the slot is cleaned up through
  * sdp_cleanup_sdp_buf(), the tail is advanced and the mbuf chain stored in
  * the slot is handed back to the caller.
  */
 static struct mbuf *
 sdp_send_completion(struct sdp_sock *ssk, int mseq)
 {
 	struct sdp_tx_ring *ring = &ssk->tx_ring;
 	struct sdp_buf *slot;
 	struct mbuf *completed;
 
 	if (unlikely(mseq != ring_tail(*ring))) {
 		printk(KERN_WARNING "Bogus send completion id %d tail %d\n",
 			mseq, ring_tail(*ring));
 		return NULL;
 	}
 
 	slot = &ring->buffer[mseq & (SDP_TX_SIZE - 1)];
 	completed = slot->mb;
 	sdp_cleanup_sdp_buf(ssk, slot, DMA_TO_DEVICE);
 
 #ifdef SDP_ZCOPY
 	/* TODO: AIO and real zcopy code; add their context support here */
 	if (BZCOPY_STATE(completed))
 		BZCOPY_STATE(completed)->busy--;
 #endif
 
 	atomic_inc(&ring->tail);
 
 	return completed;
 }
 
 /*
  * Handle the completion of a regular send.
  *
  * A completion status other than success or IB_WC_WR_FLUSH_ERR is reported
  * to the socket as ECONNRESET.  In every case the matching ring entry is
  * retired and its mbuf chain freed.  Returns 0, or -1 when
  * sdp_send_completion() yields no mbuf.
  */
 static int
 sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc)
 {
 	struct sdp_bsdh *hdr;
 	struct mbuf *mb = NULL;
 
 	if (unlikely(wc->status) && wc->status != IB_WC_WR_FLUSH_ERR) {
 		sdp_prf(ssk->socket, mb, "Send completion with error. "
 			"Status %d", wc->status);
 		sdp_dbg_data(ssk->socket, "Send completion with error. "
 			"Status %d\n", wc->status);
 		sdp_notify(ssk, ECONNRESET);
 	}
 
 	mb = sdp_send_completion(ssk, wc->wr_id);
 	if (unlikely(mb == NULL))
 		return -1;
 
 	hdr = mtod(mb, struct sdp_bsdh *);
 	sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(hdr->mseq));
 	sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d",
 	    mb, mb->m_pkthdr.len, ntohl(hdr->mseq));
 	m_freem(mb);
 
 	return 0;
 }
 
 /*
  * Dispatch one TX work completion according to the opcode bits in wr_id.
  *
  *  - SDP_OP_SEND: handed to sdp_handle_send_comp().
  *  - SDP_OP_RDMA (SDP_ZCOPY only): clears the busy flag of the in-flight
  *    RDMA read and wakes up writers on the socket.
  *  - anything else is treated as a keepalive probe: the probe counter is
  *    bumped and a status other than success or IB_WC_WR_FLUSH_ERR is
  *    reported to the socket as ECONNRESET.
  */
 static inline void
 sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
 {
 
 	if (likely(wc->wr_id & SDP_OP_SEND)) {
 		sdp_handle_send_comp(ssk, wc);
 		return;
 	}
 
 #ifdef SDP_ZCOPY
 	if (wc->wr_id & SDP_OP_RDMA) {
 		/* TODO: handle failed RDMA read cqe */
 
 		sdp_dbg_data(ssk->socket,
 		    "TX comp: RDMA read. status: %d\n", wc->status);
 		/* No local 'sk' exists here; name the socket explicitly. */
 		sdp_prf1(ssk->socket, NULL, "TX comp: RDMA read");
 
 		if (!ssk->tx_ring.rdma_inflight) {
 			sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n");
 			return;
 		}
 
 		if (!ssk->tx_ring.rdma_inflight->busy) {
 			sdp_warn(ssk->socket,
 			    "ERROR: too many RDMA read completions\n");
 			return;
 		}
 
 		/* Only last RDMA read WR is signalled. Order is guaranteed -
 		 * therefore if Last RDMA read WR is completed - all other
 		 * have, too */
 		ssk->tx_ring.rdma_inflight->busy = 0;
 		sowwakeup(ssk->socket);
 		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
 		return;
 	}
 #endif
 
 	/* Keepalive probe sent cleanup */
 	sdp_cnt(sdp_keepalive_probes_sent);
 
 	if (likely(!wc->status))
 		return;
 
 	sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n",
 			__func__, wc->status);
 
 	if (wc->status == IB_WC_WR_FLUSH_ERR)
 		return;
 
 	sdp_notify(ssk, ECONNRESET);
 }
 
 /*
  * Drain the TX completion queue.  The socket write lock must be held
  * (checked by SDP_WLOCK_ASSERT).
  *
  * Completions are polled in batches of SDP_NUM_WC until a batch comes back
  * short.  If anything completed, queued sends are pushed out with
  * sdp_post_sends() and writers sleeping on the socket are woken.  Returns
  * the number of work completions processed; 0 if the CQ is already gone.
  */
 static int
 sdp_process_tx_cq(struct sdp_sock *ssk)
 {
 	struct ib_wc ibwc[SDP_NUM_WC];
 	int n, i;
 	int wc_processed = 0;
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	if (!ssk->tx_ring.cq) {
 		sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n");
 		return 0;
 	}
 
 	do {
 		n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc);
 		for (i = 0; i < n; ++i) {
 			sdp_process_tx_wc(ssk, ibwc + i);
 			wc_processed++;
 		}
 	} while (n == SDP_NUM_WC);
 
 	if (wc_processed) {
 		sdp_post_sends(ssk, M_NOWAIT);
 		/* No local 'sk' exists here; name the socket explicitly. */
 		sdp_prf1(ssk->socket, NULL, "Waking sendmsg. inflight=%d",
 				(u32) tx_ring_posted(ssk));
 		sowwakeup(ssk->socket);
 	}
 
 	return wc_processed;
 }
 
 /*
  * Poll the TX completion queue on behalf of the timer or the interrupt
  * handler.
  *
  * Nothing is polled for a socket in TCPS_CLOSED state.  Otherwise the CQ is
  * drained, the hit/miss statistic is updated and, while sends remain in
  * flight, the tx timer is re-armed.  With SDP_ZCOPY the CQ interrupt is
  * armed as long as an RDMA read is still busy.
  */
 static void
 sdp_poll_tx(struct sdp_sock *ssk)
 {
 	struct socket *sk = ssk->socket;
 	u32 inflight, wc_processed;
 
 	sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d", 
 		(u32) tx_ring_posted(ssk),
 		ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));
 
 	if (unlikely(ssk->state == TCPS_CLOSED)) {
 		sdp_warn(sk, "Socket is closed\n");
 	} else {
 		wc_processed = sdp_process_tx_cq(ssk);
 		if (wc_processed)
 			SDPSTATS_COUNTER_INC(tx_poll_hit);
 		else
 			SDPSTATS_COUNTER_INC(tx_poll_miss);
 
 		inflight = (u32) tx_ring_posted(ssk);
 		sdp_prf1(ssk->socket, NULL,
 		    "finished tx processing. inflight = %d", inflight);
 
 		/*
 		 * Packets are still outstanding: schedule the timer so their
 		 * completions are guaranteed to be processed even if the Tx
 		 * routine has not scheduled it.
 		 */
 		if (inflight != 0) {
 			callout_reset(&ssk->tx_ring.timer,
 			    SDP_TX_POLL_TIMEOUT, sdp_poll_tx_timeout, ssk);
 		}
 	}
 
 #ifdef SDP_ZCOPY
 	if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
 		sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
 		sdp_arm_tx_cq(ssk);
 	}
 #endif
 }
 
 /*
  * Callout handler for the tx ring timer.  Polls the TX side only while the
  * callout is still marked active, clearing that mark first.
  */
 static void
 sdp_poll_tx_timeout(void *data)
 {
 	struct sdp_sock *ssk = data;
 
 	if (callout_active(&ssk->tx_ring.timer)) {
 		callout_deactivate(&ssk->tx_ring.timer);
 		sdp_poll_tx(ssk);
 	}
 }
 
 /*
  * TX completion queue callback.  'cq_context' is the sdp_sock registered
  * with the CQ; the TX side is polled with the socket write lock held.
  */
 static void
 sdp_tx_irq(struct ib_cq *cq, void *cq_context)
 {
 	struct sdp_sock *ssk = cq_context;
 
 	sdp_prf1(ssk->socket, NULL, "tx irq");
 	sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n");
 	SDPSTATS_COUNTER_INC(tx_int_count);
 
 	SDP_WLOCK(ssk);
 	sdp_poll_tx(ssk);
 	SDP_WUNLOCK(ssk);
 }
 
 static
 void sdp_tx_ring_purge(struct sdp_sock *ssk)
 {
 	while (tx_ring_posted(ssk)) {
 		struct mbuf *mb;
 		mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring));
 		if (!mb)
 			break;
 		m_freem(mb);
 	}
 }
 
 void
 sdp_post_keepalive(struct sdp_sock *ssk)
 {
 	int rc;
 	struct ib_send_wr wr, *bad_wr;
 
 	sdp_dbg(ssk->socket, "%s\n", __func__);
 
 	memset(&wr, 0, sizeof(wr));
 
 	wr.next    = NULL;
 	wr.wr_id   = 0;
 	wr.sg_list = NULL;
 	wr.num_sge = 0;
 	wr.opcode  = IB_WR_RDMA_WRITE;
 
 	rc = ib_post_send(ssk->qp, &wr, &bad_wr);
 	if (rc) {
 		sdp_dbg(ssk->socket,
 			"ib_post_keepalive failed with status %d.\n", rc);
 		sdp_notify(ssk, ECONNRESET);
 	}
 
 	sdp_cnt(sdp_keepalive_probes_sent);
 }
 
 /*
  * Event callback handed to ib_create_cq() for the TX completion queue.
  * Intentionally empty: events are ignored.
  */
 static void
 sdp_tx_cq_event_handler(struct ib_event *event, void *data)
 {
 }
 
 /*
  * Set up the transmit ring for 'ssk' on 'device'.
  *
  * Initializes the tx and nagle callouts against the socket lock, resets the
  * ring head and tail to 1, allocates SDP_TX_SIZE ring entries and creates
  * the TX completion queue with sdp_tx_irq / sdp_tx_cq_event_handler as
  * callbacks, then arms the CQ.
  *
  * Returns 0 on success.  When ib_create_cq() fails the ring buffer is
  * released again and the PTR_ERR() value of the CQ is returned.
  *
  * NOTE(review): the removed side allocated with kzalloc() (zeroed memory);
  * the added malloc() call passes M_WAITOK without M_ZERO -- confirm nothing
  * depends on the ring entries starting out zeroed.
  */
 int
 sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
 {
 	struct ib_cq *tx_cq;
 	int rc = 0;
 
 	sdp_dbg(ssk->socket, "tx ring create\n");
 	callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0);
 	callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0);
 	atomic_set(&ssk->tx_ring.head, 1);
 	atomic_set(&ssk->tx_ring.tail, 1);
 
-	ssk->tx_ring.buffer = kzalloc(
-			sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL);
-	if (!ssk->tx_ring.buffer) {
-		rc = -ENOMEM;
-		sdp_warn(ssk->socket, "Can't allocate TX Ring size %zd.\n",
-			 sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE);
+	ssk->tx_ring.buffer = malloc(sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE,
+	    M_SDP, M_WAITOK);
 
-		goto out;
-	}
-
 	tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
 			  ssk, SDP_TX_SIZE, 0);
-
 	if (IS_ERR(tx_cq)) {
 		rc = PTR_ERR(tx_cq);
 		sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc);
 		goto err_cq;
 	}
 	ssk->tx_ring.cq = tx_cq;
 	ssk->tx_ring.poll_cnt = 0;
 	sdp_arm_tx_cq(ssk);
 
 	return 0;
 
 	/* CQ creation failed: undo the buffer allocation made above. */
 err_cq:
-	kfree(ssk->tx_ring.buffer);
+	free(ssk->tx_ring.buffer, M_SDP);
 	ssk->tx_ring.buffer = NULL;
-out:
 	return rc;
 }
 
 /*
  * Tear down the transmit ring of 'ssk'.
  *
  * Stops the tx and nagle callouts under the socket write lock and then
  * drains them with the lock released, purges and frees the ring buffer if
  * one is present, and destroys the TX completion queue.  The cq pointer is
  * cleared only when ib_destroy_cq() succeeds; otherwise a warning is logged
  * and the pointer is left in place.  Finally warns if the ring head and
  * tail differ.
  */
 void
 sdp_tx_ring_destroy(struct sdp_sock *ssk)
 {
 
 	sdp_dbg(ssk->socket, "tx ring destroy\n");
 	SDP_WLOCK(ssk);
 	callout_stop(&ssk->tx_ring.timer);
 	callout_stop(&ssk->nagle_timer);
 	SDP_WUNLOCK(ssk);
 	callout_drain(&ssk->tx_ring.timer);
 	callout_drain(&ssk->nagle_timer);
 
 	if (ssk->tx_ring.buffer) {
 		sdp_tx_ring_purge(ssk);
-
-		kfree(ssk->tx_ring.buffer);
+		free(ssk->tx_ring.buffer, M_SDP);
 		ssk->tx_ring.buffer = NULL;
 	}
 
 	if (ssk->tx_ring.cq) {
 		if (ib_destroy_cq(ssk->tx_ring.cq)) {
 			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
 					ssk->tx_ring.cq);
 		} else {
 			ssk->tx_ring.cq = NULL;
 		}
 	}
 
 	WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
 }
Index: user/alc/PQ_LAUNDRY/tests/sys/aio/aio_test.c
===================================================================
--- user/alc/PQ_LAUNDRY/tests/sys/aio/aio_test.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/tests/sys/aio/aio_test.c	(revision 303517)
@@ -1,942 +1,1025 @@
 /*-
  * Copyright (c) 2004 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Regression test to do some very basic AIO exercising on several types of
  * file descriptors.  Currently, the tests consist of initializing a fixed
  * size buffer with pseudo-random data, writing it to one fd using AIO, then
  * reading it from a second descriptor using AIO.  For some targets, the same
  * fd is used for write and read (i.e., file, md device), but for others the
  * operation is performed on a peer (pty, socket, fifo, etc).  A timeout is
  * initiated to detect undue blocking.  This test does not attempt to exercise
  * error cases or more subtle asynchronous behavior, just make sure that the
  * basic operations work on some basic object types.
  */
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/mdioctl.h>
 
 #include <aio.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <libutil.h>
 #include <limits.h>
 #include <signal.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <termios.h>
 #include <unistd.h>
 
 #include <atf-c.h>
 
 #include "freebsd_test_suite/macros.h"
 #include "local.h"
 
 /* mkstemp(3) template for the temporary files the tests create. */
 #define	PATH_TEMPLATE	"aio.XXXXXXXXXX"
 
 /*
  * GLOBAL_MAX sets the largest usable buffer size to be read and written, as
  * it sizes ac_buffer in the aio_context structure.  It is also the default
  * size for file I/O.  For other types, we use smaller blocks or we risk
  * blocking (and we run in a single process/thread so that would be bad).
  */
 #define	GLOBAL_MAX	16384
 
 /* Upper bound enforced on the buflen passed to aio_context_init(). */
 #define	BUFFER_MAX	GLOBAL_MAX
 /* Per-test state shared by aio_write_test() and aio_read_test(). */
 struct aio_context {
 	/* Descriptors used for the read and the write request. */
 	int		 ac_read_fd, ac_write_fd;
 	/* PRNG seed the buffer contents were generated from. */
 	long		 ac_seed;
 	/* Data written by the write test and verified by the read test. */
 	char		 ac_buffer[GLOBAL_MAX];
 	/* Number of bytes of ac_buffer in use. */
 	int		 ac_buflen;
 	/* Timeout, in seconds, passed to aio_timeout_start(). */
 	int		 ac_seconds;
 	/* Optional callback run by aio_cleanup(), with its argument. */
 	void		 (*ac_cleanup)(void *arg);
 	void		*ac_cleanup_arg;
 };
 
 /*
  * Set to 1 by the SIGALRM handler when a test's timeout expires.  Declared
  * volatile sig_atomic_t because it is written from a signal handler and
  * read by the interrupted test code.
  */
 static volatile sig_atomic_t	aio_timedout;
 
 /*
  * Each test run specifies a timeout in seconds.  Use the somewhat obsoleted
  * signal(3) and alarm(3) APIs to set this up.
  */
 /* SIGALRM handler: record that the timeout expired. */
 static void
 aio_timeout_signal(int signo __unused)
 {
 	aio_timedout = 1;
 }
 
 /*
  * Clear the timeout flag, install the SIGALRM handler and arm an alarm that
  * fires after 'seconds' seconds.  Fails the test if the handler cannot be
  * installed.
  */
 static void
 aio_timeout_start(int seconds)
 {
 	void (*previous)(int);
 
 	aio_timedout = 0;
 	previous = signal(SIGALRM, aio_timeout_signal);
 	ATF_REQUIRE_MSG(previous != SIG_ERR,
 	    "failed to set SIGALRM handler: %s", strerror(errno));
 	alarm(seconds);
 }
 
 static void
 aio_timeout_stop(void)
 {
 
 	ATF_REQUIRE_MSG(signal(SIGALRM, NULL) != SIG_ERR,
 	    "failed to reset SIGALRM handler to default: %s", strerror(errno));
 	alarm(0);
 }
 
 /*
  * Fill a buffer given a seed that can be fed into srandom() to initialize
  * the PRNG in a repeatable manner.
  */
 /*
  * Write 'len' pseudo-random bytes into 'buffer'.  The PRNG is seeded with
  * 'seed' first, so the same seed always produces the same contents.
  */
 static void
 aio_fill_buffer(char *buffer, int len, long seed)
 {
 	int pos = 0;
 
 	srandom(seed);
 	while (pos < len) {
 		buffer[pos] = (char)(random() & 0xff);
 		pos++;
 	}
 }
 
 /*
  * Test that a buffer matches a given seed.  See aio_fill_buffer().  Return
  * (1) on a match, (0) on a mismatch.
  */
 /*
  * Check that the first 'len' bytes of 'buffer' are the sequence that
  * aio_fill_buffer() generates for 'seed'.  Returns 1 when every byte
  * matches and 0 at the first difference.
  */
 static int
 aio_test_buffer(char *buffer, int len, long seed)
 {
 	char expected;
 	int pos;
 
 	srandom(seed);
 	pos = 0;
 	while (pos < len) {
 		expected = random() & 0xff;
 		if (expected != buffer[pos])
 			return (0);
 		pos++;
 	}
 	return (1);
 }
 
 /*
  * Initialize a testing context given the file descriptors provided by the
  * test setup.
  */
 /*
  * Prepare 'ac' for one write/read round trip.
  *
  * The context is zeroed, the descriptors, length, timeout and cleanup
  * callback are recorded, and the buffer is filled from a freshly drawn
  * random seed.  The fill is verified once as a self-check.  Fails the test
  * when 'buflen' exceeds BUFFER_MAX.
  */
 static void
 aio_context_init(struct aio_context *ac, int read_fd,
     int write_fd, int buflen, int seconds, void (*cleanup)(void *),
     void *cleanup_arg)
 {
 	int filled_ok;
 
 	ATF_REQUIRE_MSG(buflen <= BUFFER_MAX,
 	    "aio_context_init: buffer too large (%d > %d)",
 	    buflen, BUFFER_MAX);
 
 	memset(ac, 0, sizeof(*ac));
 	ac->ac_cleanup_arg = cleanup_arg;
 	ac->ac_cleanup = cleanup;
 	ac->ac_seconds = seconds;
 	ac->ac_buflen = buflen;
 	ac->ac_write_fd = write_fd;
 	ac->ac_read_fd = read_fd;
 
 	srandomdev();
 	ac->ac_seed = random();
 	aio_fill_buffer(ac->ac_buffer, buflen, ac->ac_seed);
 	filled_ok = aio_test_buffer(ac->ac_buffer, buflen, ac->ac_seed);
 	ATF_REQUIRE_MSG(filled_ok != 0, "aio_test_buffer: internal error");
 }
 
 /*
  * Each tester can register a callback to clean up in the event the test
  * fails.  Preserve the value of errno so that subsequent calls to errx()
  * work properly.
  */
 /*
  * Run the cleanup callback registered in 'ac', if any.  errno is saved
  * before the call and restored afterwards, so a failure message printed
  * later still reports the original error.
  */
 static void
 aio_cleanup(struct aio_context *ac)
 {
 	int saved_errno;
 
 	if (ac->ac_cleanup != NULL) {
 		saved_errno = errno;
 		ac->ac_cleanup(ac->ac_cleanup_arg);
 		errno = saved_errno;
 	}
 }
 
 /*
  * Perform a simple write test of our initialized data buffer to the provided
  * file descriptor.
  */
 static void
 aio_write_test(struct aio_context *ac)
 {
 	struct aiocb aio, *aiop;
 	ssize_t len;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	bzero(&aio, sizeof(aio));
 	aio.aio_buf = ac->ac_buffer;
 	aio.aio_nbytes = ac->ac_buflen;
 	aio.aio_fildes = ac->ac_write_fd;
 	aio.aio_offset = 0;
 
 	aio_timeout_start(ac->ac_seconds);
 
 	if (aio_write(&aio) < 0) {
 		if (errno == EINTR) {
 			if (aio_timedout) {
 				aio_cleanup(ac);
 				atf_tc_fail("aio_write timed out");
 			}
 		}
 		aio_cleanup(ac);
 		atf_tc_fail("aio_write failed: %s", strerror(errno));
 	}
 
 	len = aio_waitcomplete(&aiop, NULL);
 	if (len < 0) {
 		if (errno == EINTR) {
 			if (aio_timedout) {
 				aio_cleanup(ac);
 				atf_tc_fail("aio_waitcomplete timed out");
 			}
 		}
 		aio_cleanup(ac);
 		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
 	}
 
 	aio_timeout_stop();
 
 	if (len != ac->ac_buflen) {
 		aio_cleanup(ac);
 		atf_tc_fail("aio_waitcomplete short write (%jd)",
 		    (intmax_t)len);
 	}
 }
 
 /*
  * Perform a simple read test of our initialized data buffer from the
  * provided file descriptor.
  */
 /*
  * Read ac_buflen bytes from ac_read_fd into the (zeroed) context buffer
  * with aio_read(), wait for the request with aio_waitcomplete() and check
  * that the data matches the context's seed, all under the context's
  * timeout.  Any error, timeout, short read or data mismatch runs the
  * cleanup callback and fails the test.
  */
 static void
 aio_read_test(struct aio_context *ac)
 {
 	struct aiocb aio, *aiop;
 	ssize_t len;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	/* Wipe the buffer so stale data cannot satisfy the comparison. */
 	bzero(ac->ac_buffer, ac->ac_buflen);
 	bzero(&aio, sizeof(aio));
 	aio.aio_buf = ac->ac_buffer;
 	aio.aio_nbytes = ac->ac_buflen;
 	aio.aio_fildes = ac->ac_read_fd;
 	aio.aio_offset = 0;
 
 	aio_timeout_start(ac->ac_seconds);
 
 	if (aio_read(&aio) < 0) {
 		if (errno == EINTR) {
 			if (aio_timedout) {
 				aio_cleanup(ac);
 				/* Report the call that actually timed out. */
 				atf_tc_fail("aio_read timed out");
 			}
 		}
 		aio_cleanup(ac);
 		atf_tc_fail("aio_read failed: %s", strerror(errno));
 	}
 
 	len = aio_waitcomplete(&aiop, NULL);
 	if (len < 0) {
 		if (errno == EINTR) {
 			if (aio_timedout) {
 				aio_cleanup(ac);
 				atf_tc_fail("aio_waitcomplete timed out");
 			}
 		}
 		aio_cleanup(ac);
 		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
 	}
 
 	aio_timeout_stop();
 
 	if (len != ac->ac_buflen) {
 		aio_cleanup(ac);
 		atf_tc_fail("aio_waitcomplete short read (%jd)",
 		    (intmax_t)len);
 	}
 
 	if (aio_test_buffer(ac->ac_buffer, ac->ac_buflen, ac->ac_seed) == 0) {
 		aio_cleanup(ac);
 		atf_tc_fail("buffer mismatched");
 	}
 }
 
 /*
  * Series of type-specific tests for AIO.  For now, we just make sure we can
  * issue a write and then a read to each type.  We assume that once a write
  * is issued, a read can follow.
  */
 
 /*
  * Test with a classic file.  Assumes we can create a moderate size temporary
  * file.
  */
 /* Cleanup state for the regular-file test. */
 struct aio_file_arg {
 	int	 afa_fd;	/* descriptor closed by the cleanup */
 	char	*afa_pathname;	/* path unlinked by the cleanup */
 };
 
 /* Close the temporary file and remove it; 'arg' is a struct aio_file_arg. */
 static void
 aio_file_cleanup(void *arg)
 {
 	struct aio_file_arg *file = arg;
 
 	close(file->afa_fd);
 	unlink(file->afa_pathname);
 }
 
 #define	FILE_LEN	GLOBAL_MAX
 #define	FILE_TIMEOUT	30
 /*
  * Write FILE_LEN bytes to a temporary regular file with AIO and read them
  * back through the same descriptor.
  */
 ATF_TC_WITHOUT_HEAD(aio_file_test);
 ATF_TC_BODY(aio_file_test, tc)
 {
 	struct aio_context ac;
 	struct aio_file_arg arg;
 	char pathname[PATH_MAX];
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 	ATF_REQUIRE_UNSAFE_AIO();
 
 	strcpy(pathname, PATH_TEMPLATE);
 	arg.afa_pathname = pathname;
 	arg.afa_fd = mkstemp(pathname);
 	ATF_REQUIRE_MSG(arg.afa_fd != -1, "mkstemp failed: %s",
 	    strerror(errno));
 
 	aio_context_init(&ac, arg.afa_fd, arg.afa_fd, FILE_LEN,
 	    FILE_TIMEOUT, aio_file_cleanup, &arg);
 	aio_write_test(&ac);
 	aio_read_test(&ac);
 
 	aio_file_cleanup(&arg);
 }
 
 struct aio_fifo_arg {
 	int	 afa_read_fd;
 	int	 afa_write_fd;
 	char	*afa_pathname;
 };
 
 /*
  * Close whichever FIFO ends have been opened and remove the FIFO itself;
  * 'arg' is a struct aio_fifo_arg.
  */
 static void
 aio_fifo_cleanup(void *arg)
 {
 	struct aio_fifo_arg *fifo = arg;
 	int ends[2];
 	int i;
 
 	ends[0] = fifo->afa_read_fd;
 	ends[1] = fifo->afa_write_fd;
 	for (i = 0; i < 2; i++) {
 		if (ends[i] != -1)
 			close(ends[i]);
 	}
 	unlink(fifo->afa_pathname);
 }
 
 #define	FIFO_LEN	256
 #define	FIFO_TIMEOUT	30
 /*
  * Write FIFO_LEN bytes into a named FIFO with AIO and read them back from
  * its other end.
  */
 ATF_TC_WITHOUT_HEAD(aio_fifo_test);
 ATF_TC_BODY(aio_fifo_test, tc)
 {
 	int error, read_fd = -1, write_fd = -1, tmp_fd;
 	struct aio_fifo_arg arg;
 	char pathname[PATH_MAX];
 	struct aio_context ac;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 	ATF_REQUIRE_UNSAFE_AIO();
 
 	/*
 	 * In theory, mkstemp() can return a name that is then collided with.
 	 * Because this is a regression test, we treat that as a test failure
 	 * rather than retrying.
 	 */
 	strcpy(pathname, PATH_TEMPLATE);
 	tmp_fd = mkstemp(pathname);
 	ATF_REQUIRE_MSG(tmp_fd != -1,
 	    "mkstemp failed: %s", strerror(errno));
 	/* Only the unique name is needed; do not leak the descriptor. */
 	close(tmp_fd);
 	ATF_REQUIRE_MSG(unlink(pathname) == 0,
 	    "unlink failed: %s", strerror(errno));
 	ATF_REQUIRE_MSG(mkfifo(pathname, 0600) != -1,
 	    "mkfifo failed: %s", strerror(errno));
 	arg.afa_pathname = pathname;
 	arg.afa_read_fd = -1;
 	arg.afa_write_fd = -1;
 
 	/* Open the read end non-blocking so the open itself cannot hang. */
 	read_fd = open(pathname, O_RDONLY | O_NONBLOCK);
 	if (read_fd == -1) {
 		error = errno;
 		aio_fifo_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("read_fd open failed: %s",
 		    strerror(errno));
 	}
 	arg.afa_read_fd = read_fd;
 
 	write_fd = open(pathname, O_WRONLY);
 	if (write_fd == -1) {
 		error = errno;
 		aio_fifo_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("write_fd open failed: %s",
 		    strerror(errno));
 	}
 	arg.afa_write_fd = write_fd;
 
 	aio_context_init(&ac, read_fd, write_fd, FIFO_LEN,
 	    FIFO_TIMEOUT, aio_fifo_cleanup, &arg);
 	aio_write_test(&ac);
 	aio_read_test(&ac);
 
 	aio_fifo_cleanup(&arg);
 }
 
 /* Cleanup state for the UNIX socketpair test: both socket descriptors. */
 struct aio_unix_socketpair_arg {
 	int	asa_sockets[2];
 };
 
 /* Close both sockets; 'arg' is a struct aio_unix_socketpair_arg. */
 static void
 aio_unix_socketpair_cleanup(void *arg)
 {
 	struct aio_unix_socketpair_arg *pair = arg;
 	int i;
 
 	for (i = 0; i < 2; i++)
 		close(pair->asa_sockets[i]);
 }
 
 #define	UNIX_SOCKETPAIR_LEN	256
 #define	UNIX_SOCKETPAIR_TIMEOUT	30
 /*
  * Write to one end of a UNIX stream socketpair with AIO and read from the
  * other, checking that getrusage() accounts exactly one message sent for
  * the write and one message received for the read.
  */
 ATF_TC_WITHOUT_HEAD(aio_unix_socketpair_test);
 ATF_TC_BODY(aio_unix_socketpair_test, tc)
 {
 	struct rusage ru_before, ru_after;
 	struct aio_unix_socketpair_arg arg;
 	struct aio_context ac;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	ATF_REQUIRE_MSG(
 	    socketpair(PF_UNIX, SOCK_STREAM, 0, arg.asa_sockets) != -1,
 	    "socketpair failed: %s", strerror(errno));
 
 	aio_context_init(&ac, arg.asa_sockets[0], arg.asa_sockets[1],
 	    UNIX_SOCKETPAIR_LEN, UNIX_SOCKETPAIR_TIMEOUT,
 	    aio_unix_socketpair_cleanup, &arg);
 
 	/* The write must account for exactly one sent message. */
 	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_before) != -1,
 	    "getrusage failed: %s", strerror(errno));
 	aio_write_test(&ac);
 	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1,
 	    "getrusage failed: %s", strerror(errno));
 	ATF_REQUIRE(ru_after.ru_msgsnd == ru_before.ru_msgsnd + 1);
 
 	/* The read must account for exactly one received message. */
 	ru_before = ru_after;
 	aio_read_test(&ac);
 	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1,
 	    "getrusage failed: %s", strerror(errno));
 	ATF_REQUIRE(ru_after.ru_msgrcv == ru_before.ru_msgrcv + 1);
 
 	aio_unix_socketpair_cleanup(&arg);
 }
 
 /* Cleanup state for the pty test: the two descriptors from openpty(). */
 struct aio_pty_arg {
 	int	apa_read_fd;
 	int	apa_write_fd;
 };
 
 /* Close both pty descriptors; 'arg' is a struct aio_pty_arg. */
 static void
 aio_pty_cleanup(void *arg)
 {
 	struct aio_pty_arg *apa;
 
 	apa = arg;
 	close(apa->apa_read_fd);
 	close(apa->apa_write_fd);
 }
 
 #define	PTY_LEN		256
 #define	PTY_TIMEOUT	30
 ATF_TC_WITHOUT_HEAD(aio_pty_test);
 ATF_TC_BODY(aio_pty_test, tc)
 {
 	struct aio_pty_arg arg;
 	struct aio_context ac;
 	int read_fd, write_fd;
 	struct termios ts;
 	int error;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 	ATF_REQUIRE_UNSAFE_AIO();
 
 	ATF_REQUIRE_MSG(openpty(&read_fd, &write_fd, NULL, NULL, NULL) == 0,
 	    "openpty failed: %s", strerror(errno));
 
 	arg.apa_read_fd = read_fd;
 	arg.apa_write_fd = write_fd;
 
 	if (tcgetattr(write_fd, &ts) < 0) {
 		error = errno;
 		aio_pty_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("tcgetattr failed: %s", strerror(errno));
 	}
 	cfmakeraw(&ts);
 	if (tcsetattr(write_fd, TCSANOW, &ts) < 0) {
 		error = errno;
 		aio_pty_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("tcsetattr failed: %s", strerror(errno));
 	}
 	aio_context_init(&ac, read_fd, write_fd, PTY_LEN,
 	    PTY_TIMEOUT, aio_pty_cleanup, &arg);
 
 	aio_write_test(&ac);
 	aio_read_test(&ac);
 
 	aio_pty_cleanup(&arg);
 }
 
 /* Close both ends of the pipe; 'arg' points at the int[2] from pipe(). */
 static void
 aio_pipe_cleanup(void *arg)
 {
 	int *ends = arg;
 	int i;
 
 	for (i = 0; i < 2; i++)
 		close(ends[i]);
 }
 
 #define	PIPE_LEN	256
 #define	PIPE_TIMEOUT	30
 /*
  * Write PIPE_LEN bytes into the write end of a pipe with AIO and read them
  * back from the read end.
  */
 ATF_TC_WITHOUT_HEAD(aio_pipe_test);
 ATF_TC_BODY(aio_pipe_test, tc)
 {
 	int pipes[2];
 	struct aio_context ac;
 	int rc;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 	ATF_REQUIRE_UNSAFE_AIO();
 
 	rc = pipe(pipes);
 	ATF_REQUIRE_MSG(rc != -1, "pipe failed: %s", strerror(errno));
 
 	/* pipes[0] is the read end, pipes[1] the write end. */
 	aio_context_init(&ac, pipes[0], pipes[1], PIPE_LEN,
 	    PIPE_TIMEOUT, aio_pipe_cleanup, pipes);
 	aio_write_test(&ac);
 	aio_read_test(&ac);
 
 	aio_pipe_cleanup(pipes);
 }
 
 /* Cleanup state for the md(4) test; -1 marks a field that is not set yet. */
 struct aio_md_arg {
 	int	ama_mdctl_fd;	/* open md control device */
 	int	ama_unit;	/* attached md unit number, or -1 */
 	int	ama_fd;		/* open md device, or -1 */
 };
 
 static void
 aio_md_cleanup(void *arg)
 {
 	struct aio_md_arg *ama;
 	struct md_ioctl mdio;
 	int error;
 
 	ama = arg;
 
 	if (ama->ama_fd != -1)
 		close(ama->ama_fd);
 
 	if (ama->ama_unit != -1) {
 		bzero(&mdio, sizeof(mdio));
 		mdio.md_version = MDIOVERSION;
 		mdio.md_unit = ama->ama_unit;
 		if (ioctl(ama->ama_mdctl_fd, MDIOCDETACH, &mdio) == -1) {
 			error = errno;
 			close(ama->ama_mdctl_fd);
 			errno = error;
 			atf_tc_fail("ioctl MDIOCDETACH failed: %s",
 			    strerror(errno));
 		}
 	}
 
 	close(ama->ama_mdctl_fd);
 }
 
 #define	MD_LEN		GLOBAL_MAX
 #define	MD_TIMEOUT	30
 /*
  * Attach a malloc-backed md(4) device of GLOBAL_MAX bytes, write MD_LEN
  * bytes to it with AIO and read them back through the same descriptor.
  * Requires root (see the test head).
  */
 ATF_TC(aio_md_test);
 ATF_TC_HEAD(aio_md_test, tc)
 {
 
 	atf_tc_set_md_var(tc, "require.user", "root");
 }
 ATF_TC_BODY(aio_md_test, tc)
 {
 	int error, fd, mdctl_fd, unit;
 	char pathname[PATH_MAX];
 	struct aio_md_arg arg;
 	struct aio_context ac;
 	struct md_ioctl mdio;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0);
 	ATF_REQUIRE_MSG(mdctl_fd != -1,
 	    "opening /dev/%s failed: %s", MDCTL_NAME, strerror(errno));
 
 	bzero(&mdio, sizeof(mdio));
 	mdio.md_version = MDIOVERSION;
 	mdio.md_type = MD_MALLOC;
 	mdio.md_options = MD_AUTOUNIT | MD_COMPRESS;
 	mdio.md_mediasize = GLOBAL_MAX;
 	mdio.md_sectorsize = 512;
 
 	arg.ama_mdctl_fd = mdctl_fd;
 	arg.ama_unit = -1;
 	arg.ama_fd = -1;
 	if (ioctl(mdctl_fd, MDIOCATTACH, &mdio) < 0) {
 		error = errno;
 		aio_md_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("ioctl MDIOCATTACH failed: %s", strerror(errno));
 	}
 
 	arg.ama_unit = unit = mdio.md_unit;
 	snprintf(pathname, PATH_MAX, "/dev/md%d", unit);
 	fd = open(pathname, O_RDWR);
 	if (fd == -1) {
 		/*
 		 * The unit is already attached: detach it and close the
 		 * control device before failing, instead of leaking both.
 		 */
 		error = errno;
 		aio_md_cleanup(&arg);
 		errno = error;
 		atf_tc_fail("opening %s failed: %s", pathname,
 		    strerror(errno));
 	}
 	arg.ama_fd = fd;
 
 	aio_context_init(&ac, fd, fd, MD_LEN, MD_TIMEOUT,
 	    aio_md_cleanup, &arg);
 	aio_write_test(&ac);
 	aio_read_test(&ac);
 
 	aio_md_cleanup(&arg);
 }
 
 /*
  * Exercise the size limit of aio_read() on an empty, already unlinked
  * temporary file.
  *
  * A request of the maximum size (SSIZE_MAX, or INT_MAX on LP64 when the
  * debug.iosize_max_clamp sysctl is non-zero) must complete and return 0
  * bytes.  A request one byte larger must be rejected with EINVAL, either
  * by aio_read() itself or by aio_waitcomplete(); any other outcome fails
  * the test.
  */
 ATF_TC_WITHOUT_HEAD(aio_large_read_test);
 ATF_TC_BODY(aio_large_read_test, tc)
 {
 	char pathname[PATH_MAX];
 	struct aiocb cb, *cbp;
 	ssize_t nread;
 	size_t len;
 	int fd;
 #ifdef __LP64__
 	int clamped;
 #endif
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 	ATF_REQUIRE_UNSAFE_AIO();
 
 #ifdef __LP64__
 	len = sizeof(clamped);
 	if (sysctlbyname("debug.iosize_max_clamp", &clamped, &len, NULL, 0) ==
 	    -1)
 		atf_libc_error(errno, "Failed to read debug.iosize_max_clamp");
 #endif
 
 	/* Determine the maximum supported read(2) size. */
 	len = SSIZE_MAX;
 #ifdef __LP64__
 	if (clamped)
 		len = INT_MAX;
 #endif
 
 	strcpy(pathname, PATH_TEMPLATE);
 	fd = mkstemp(pathname);
 	ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno));
 
 	/* The open descriptor is all that is needed from here on. */
 	unlink(pathname);
 
 	/* First request: exactly the maximum size; expect 0 bytes read. */
 	memset(&cb, 0, sizeof(cb));
 	cb.aio_nbytes = len;
 	cb.aio_fildes = fd;
 	cb.aio_buf = NULL;
 	if (aio_read(&cb) == -1)
 		atf_tc_fail("aio_read() of maximum read size failed: %s",
 		    strerror(errno));
 
 	nread = aio_waitcomplete(&cbp, NULL);
 	if (nread == -1)
 		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
 	if (nread != 0)
 		atf_tc_fail("aio_read() from empty file returned data: %zd",
 		    nread);
 
 	/* Second request: one byte over the maximum; expect EINVAL. */
 	memset(&cb, 0, sizeof(cb));
 	cb.aio_nbytes = len + 1;
 	cb.aio_fildes = fd;
 	cb.aio_buf = NULL;
 	if (aio_read(&cb) == -1) {
 		if (errno == EINVAL)
 			goto finished;
 		atf_tc_fail("aio_read() of too large read size failed: %s",
 		    strerror(errno));
 	}
 
 	nread = aio_waitcomplete(&cbp, NULL);
 	if (nread == -1) {
 		if (errno == EINVAL)
 			goto finished;
 		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
 	}
 	/* The oversized request completed without error: that is a failure. */
 	atf_tc_fail("aio_read() of too large read size returned: %zd", nread);
 
 finished:
 	close(fd);
 }
 
 /*
  * This tests for a bug where arriving socket data can wakeup multiple
  * AIO read requests resulting in an uncancellable request.
  */
 /*
  * Queue two AIO reads on one end of a UNIX socketpair, send a single byte
  * from the other end, and check that exactly one request completes with
  * that byte while the other is still in progress and can be cancelled.
  */
 ATF_TC_WITHOUT_HEAD(aio_socket_two_reads);
 ATF_TC_BODY(aio_socket_two_reads, tc)
 {
 	struct ioreq {
 		struct aiocb iocb;
 		char buffer[1024];
 	} ioreq[2];
 	struct aiocb *iocb;
 	unsigned i;
 	int s[2];
 	char c;
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 #if __FreeBSD_version < 1100101
 	/* Older kernels cannot run this test; skip rather than fail. */
 	atf_tc_skip("kernel version %d is too old (%d required)",
 	    __FreeBSD_version, 1100101);
 #endif
 
 	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);
 
 	/* Queue two read requests. */
 	memset(&ioreq, 0, sizeof(ioreq));
 	for (i = 0; i < nitems(ioreq); i++) {
 		ioreq[i].iocb.aio_nbytes = sizeof(ioreq[i].buffer);
 		ioreq[i].iocb.aio_fildes = s[0];
 		ioreq[i].iocb.aio_buf = ioreq[i].buffer;
 		ATF_REQUIRE(aio_read(&ioreq[i].iocb) == 0);
 	}
 
 	/* Send a single byte.  This should complete one request. */
 	c = 0xc3;
 	ATF_REQUIRE(write(s[1], &c, sizeof(c)) == 1);
 
 	ATF_REQUIRE(aio_waitcomplete(&iocb, NULL) == 1);
 
 	/* Determine which request completed and verify the data was read. */
 	if (iocb == &ioreq[0].iocb)
 		i = 0;
 	else
 		i = 1;
 	ATF_REQUIRE(ioreq[i].buffer[0] == c);
 
 	/* Switch to the index of the request that did not complete. */
 	i ^= 1;
 
 	/*
 	 * Try to cancel the other request.  On broken systems this
 	 * will fail and the process will hang on exit.
 	 */
 	ATF_REQUIRE(aio_error(&ioreq[i].iocb) == EINPROGRESS);
 	ATF_REQUIRE(aio_cancel(s[0], &ioreq[i].iocb) == AIO_CANCELED);
 
 	close(s[1]);
 	close(s[0]);
 }
 
 /*
  * This test ensures that aio_write() on a blocking socket of a "large"
  * buffer does not return a short completion.
  */
 ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write);
 ATF_TC_BODY(aio_socket_blocking_short_write, tc)
 {
 	struct aiocb iocb, *iocbp;
 	char *buffer[2];
 	ssize_t done;
 	int buffer_size, sb_size;
 	socklen_t len;
 	int s[2];
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);
 
 	len = sizeof(sb_size);
 	ATF_REQUIRE(getsockopt(s[0], SOL_SOCKET, SO_RCVBUF, &sb_size, &len) !=
 	    -1);
 	ATF_REQUIRE(len == sizeof(sb_size));
 	buffer_size = sb_size;
 
 	ATF_REQUIRE(getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &len) !=
 	    -1);
 	ATF_REQUIRE(len == sizeof(sb_size));
 	if (sb_size > buffer_size)
 		buffer_size = sb_size;
 
 	/*
 	 * Use twice the size of the MAX(receive buffer, send buffer)
 	 * to ensure that the write is split up into multiple writes
 	 * internally.
 	 */
 	buffer_size *= 2;
 
 	buffer[0] = malloc(buffer_size);
 	ATF_REQUIRE(buffer[0] != NULL);
 	buffer[1] = malloc(buffer_size);
 	ATF_REQUIRE(buffer[1] != NULL);
 
 	srandomdev();
 	aio_fill_buffer(buffer[1], buffer_size, random());
 
 	memset(&iocb, 0, sizeof(iocb));
 	iocb.aio_fildes = s[1];
 	iocb.aio_buf = buffer[1];
 	iocb.aio_nbytes = buffer_size;
 	ATF_REQUIRE(aio_write(&iocb) == 0);
 
 	done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL);
 	ATF_REQUIRE(done == buffer_size);
 
 	done = aio_waitcomplete(&iocbp, NULL);
 	ATF_REQUIRE(iocbp == &iocb);
 	ATF_REQUIRE(done == buffer_size);
 
 	ATF_REQUIRE(memcmp(buffer[0], buffer[1], buffer_size) == 0);
 
 	close(s[1]);
 	close(s[0]);
 }
 
 /*
  * This test verifies that cancelling a partially completed socket write
  * returns a short write rather than ECANCELED.
  */
 ATF_TC_WITHOUT_HEAD(aio_socket_short_write_cancel);
 ATF_TC_BODY(aio_socket_short_write_cancel, tc)
 {
 	struct aiocb iocb, *iocbp;
 	char *buffer[2];
 	ssize_t done;
 	int buffer_size, sb_size;
 	socklen_t len;
 	int s[2];
 
 	ATF_REQUIRE_KERNEL_MODULE("aio");
 
 	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);
 
 	len = sizeof(sb_size);
 	ATF_REQUIRE(getsockopt(s[0], SOL_SOCKET, SO_RCVBUF, &sb_size, &len) !=
 	    -1);
 	ATF_REQUIRE(len == sizeof(sb_size));
 	buffer_size = sb_size;
 
 	ATF_REQUIRE(getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &len) !=
 	    -1);
 	ATF_REQUIRE(len == sizeof(sb_size));
 	if (sb_size > buffer_size)
 		buffer_size = sb_size;
 
 	/*
 	 * Use three times the size of the MAX(receive buffer, send
 	 * buffer) for the write to ensure that the write is split up
 	 * into multiple writes internally.  The recv() ensures that
 	 * the write has partially completed, but a remaining size of
 	 * two buffers should ensure that the write has not completed
 	 * fully when it is cancelled.
 	 */
 	buffer[0] = malloc(buffer_size);
 	ATF_REQUIRE(buffer[0] != NULL);
 	buffer[1] = malloc(buffer_size * 3);
 	ATF_REQUIRE(buffer[1] != NULL);
 
 	srandomdev();
 	aio_fill_buffer(buffer[1], buffer_size * 3, random());
 
 	memset(&iocb, 0, sizeof(iocb));
 	iocb.aio_fildes = s[1];
 	iocb.aio_buf = buffer[1];
 	iocb.aio_nbytes = buffer_size * 3;
 	ATF_REQUIRE(aio_write(&iocb) == 0);
 
 	done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL);
 	ATF_REQUIRE(done == buffer_size);
 
 	ATF_REQUIRE(aio_error(&iocb) == EINPROGRESS);
 	ATF_REQUIRE(aio_cancel(s[1], &iocb) == AIO_NOTCANCELED);
 
 	done = aio_waitcomplete(&iocbp, NULL);
 	ATF_REQUIRE(iocbp == &iocb);
 	ATF_REQUIRE(done >= buffer_size && done <= buffer_size * 2);
 
 	ATF_REQUIRE(memcmp(buffer[0], buffer[1], buffer_size) == 0);
 
 	close(s[1]);
 	close(s[0]);
 }
 
+/*
+ * Basic aio_fsync() test: the sync must complete after all queued writes.
+ */
+ATF_TC_WITHOUT_HEAD(aio_fsync_test);
+ATF_TC_BODY(aio_fsync_test, tc)
+{
+	struct aiocb synccb, *iocbp;
+	struct {
+		struct aiocb iocb;
+		bool done;
+		char *buffer;
+	} buffers[16];
+	struct stat sb;
+	char pathname[PATH_MAX];
+	ssize_t rval;
+	unsigned i;
+	int fd;
+
+	ATF_REQUIRE_KERNEL_MODULE("aio");
+	ATF_REQUIRE_UNSAFE_AIO();
+
+	strcpy(pathname, PATH_TEMPLATE);
+	fd = mkstemp(pathname);
+	ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno));
+	unlink(pathname);
+
+	ATF_REQUIRE(fstat(fd, &sb) == 0);
+	ATF_REQUIRE(sb.st_blksize != 0);
+	ATF_REQUIRE(ftruncate(fd, sb.st_blksize * nitems(buffers)) == 0);
+
+	/*
+	 * Queue several asynchronous write requests.  Hopefully this
+	 * forces the aio_fsync() request to be deferred.  There is no
+	 * reliable way to guarantee that, however.
+	 */
+	srandomdev();
+	for (i = 0; i < nitems(buffers); i++) {
+		memset(&buffers[i], 0, sizeof(buffers[i])); /* done = false */
+		buffers[i].buffer = malloc(sb.st_blksize);
+		ATF_REQUIRE(buffers[i].buffer != NULL);
+		aio_fill_buffer(buffers[i].buffer, sb.st_blksize, random());
+		buffers[i].iocb.aio_fildes = fd;
+		buffers[i].iocb.aio_buf = buffers[i].buffer;
+		buffers[i].iocb.aio_nbytes = sb.st_blksize;
+		buffers[i].iocb.aio_offset = sb.st_blksize * i;
+		ATF_REQUIRE(aio_write(&buffers[i].iocb) == 0);
+	}
+
+	/* Queue the aio_fsync request. */
+	memset(&synccb, 0, sizeof(synccb));
+	synccb.aio_fildes = fd;
+	ATF_REQUIRE(aio_fsync(O_SYNC, &synccb) == 0);
+
+	/* Reap completions until the fsync request itself is returned. */
+	for (;;) {
+	next:
+		rval = aio_waitcomplete(&iocbp, NULL);
+		ATF_REQUIRE(iocbp != NULL);
+		if (iocbp == &synccb) {
+			ATF_REQUIRE(rval == 0);
+			break;
+		}
+
+		for (i = 0; i < nitems(buffers); i++) {
+			if (iocbp == &buffers[i].iocb) {
+				ATF_REQUIRE(buffers[i].done == false);
+				ATF_REQUIRE(rval == sb.st_blksize);
+				buffers[i].done = true;
+				goto next;
+			}
+		}
+
+		ATF_REQUIRE_MSG(false, "unmatched AIO request");
+	}
+
+	for (i = 0; i < nitems(buffers); i++)
+		ATF_REQUIRE_MSG(buffers[i].done,
+		    "AIO request %u did not complete", i);
+
+	close(fd);
+}
+
 ATF_TP_ADD_TCS(tp)
 {
 
 	ATF_TP_ADD_TC(tp, aio_file_test);
 	ATF_TP_ADD_TC(tp, aio_fifo_test);
 	ATF_TP_ADD_TC(tp, aio_unix_socketpair_test);
 	ATF_TP_ADD_TC(tp, aio_pty_test);
 	ATF_TP_ADD_TC(tp, aio_pipe_test);
 	ATF_TP_ADD_TC(tp, aio_md_test);
 	ATF_TP_ADD_TC(tp, aio_large_read_test);
 	ATF_TP_ADD_TC(tp, aio_socket_two_reads);
 	ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write);
 	ATF_TP_ADD_TC(tp, aio_socket_short_write_cancel);
+	ATF_TP_ADD_TC(tp, aio_fsync_test);
 
 	return (atf_no_error());
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/indent/args.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/indent/args.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/indent/args.c	(revision 303517)
@@ -1,327 +1,327 @@
 /*
  * Copyright (c) 1985 Sun Microsystems, Inc.
  * Copyright (c) 1980, 1993
  *	The Regents of the University of California.  All rights reserved.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)args.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Argument scanning and profile reading code.  Default parameters are set
  * here as well.
  */
 
 #include <ctype.h>
 #include <err.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "indent_globs.h"
 #include "indent.h"
 
 /* profile types */
 #define	PRO_SPECIAL	1	/* special case */
 #define	PRO_BOOL	2	/* boolean */
 #define	PRO_INT		3	/* integer */
 #define PRO_FONT	4	/* troff font */
 
 /* profile specials for booleans */
 #define	ON		1	/* turn it on */
 #define	OFF		0	/* turn it off */
 
 /* profile specials for specials */
 #define	IGN		1	/* ignore it */
 #define	CLI		2	/* case label indent (float) */
 #define	STDIN		3	/* use stdin */
 #define	KEY		4	/* type (keyword) */
 
 static void scan_profile(FILE *);
 
 const char *option_source = "?";
 
 /*
  * N.B.: because of the way the table here is scanned, options whose names are
  * substrings of other options must occur later; that is, with -lp vs -l, -lp
  * must be first.  Also, while (most) booleans occur more than once, the last
  * default value is the one actually assigned.
  */
 struct pro {
     const char *p_name;		/* name, e.g. -bl, -cli */
     int         p_type;		/* type (int, bool, special) */
     int         p_default;	/* the default value (if int) */
     int         p_special;	/* depends on type */
     int        *p_obj;		/* the associated variable */
 }           pro[] = {
 
     {"T", PRO_SPECIAL, 0, KEY, 0},
     {"bacc", PRO_BOOL, false, ON, &blanklines_around_conditional_compilation},
     {"badp", PRO_BOOL, false, ON, &blanklines_after_declarations_at_proctop},
     {"bad", PRO_BOOL, false, ON, &blanklines_after_declarations},
     {"bap", PRO_BOOL, false, ON, &blanklines_after_procs},
     {"bbb", PRO_BOOL, false, ON, &blanklines_before_blockcomments},
     {"bc", PRO_BOOL, true, OFF, &ps.leave_comma},
     {"bl", PRO_BOOL, true, OFF, &btype_2},
     {"br", PRO_BOOL, true, ON, &btype_2},
     {"bs", PRO_BOOL, false, ON, &Bill_Shannon},
     {"cdb", PRO_BOOL, true, ON, &comment_delimiter_on_blankline},
     {"cd", PRO_INT, 0, 0, &ps.decl_com_ind},
     {"ce", PRO_BOOL, true, ON, &cuddle_else},
     {"ci", PRO_INT, 0, 0, &continuation_indent},
     {"cli", PRO_SPECIAL, 0, CLI, 0},
     {"c", PRO_INT, 33, 0, &ps.com_ind},
     {"di", PRO_INT, 16, 0, &ps.decl_indent},
     {"dj", PRO_BOOL, false, ON, &ps.ljust_decl},
     {"d", PRO_INT, 0, 0, &ps.unindent_displace},
     {"eei", PRO_BOOL, false, ON, &extra_expression_indent},
     {"ei", PRO_BOOL, true, ON, &ps.else_if},
     {"fbc", PRO_FONT, 0, 0, (int *) &blkcomf},
     {"fbs", PRO_BOOL, true, ON, &function_brace_split},
     {"fbx", PRO_FONT, 0, 0, (int *) &boxcomf},
     {"fb", PRO_FONT, 0, 0, (int *) &bodyf},
     {"fc1", PRO_BOOL, true, ON, &format_col1_comments},
     {"fcb", PRO_BOOL, true, ON, &format_block_comments},
     {"fc", PRO_FONT, 0, 0, (int *) &scomf},
     {"fk", PRO_FONT, 0, 0, (int *) &keywordf},
     {"fs", PRO_FONT, 0, 0, (int *) &stringf},
     {"ip", PRO_BOOL, true, ON, &ps.indent_parameters},
     {"i", PRO_INT, 8, 0, &ps.ind_size},
     {"lc", PRO_INT, 0, 0, &block_comment_max_col},
     {"ldi", PRO_INT, -1, 0, &ps.local_decl_indent},
     {"lp", PRO_BOOL, true, ON, &lineup_to_parens},
     {"l", PRO_INT, 78, 0, &max_col},
     {"nbacc", PRO_BOOL, false, OFF, &blanklines_around_conditional_compilation},
     {"nbadp", PRO_BOOL, false, OFF, &blanklines_after_declarations_at_proctop},
     {"nbad", PRO_BOOL, false, OFF, &blanklines_after_declarations},
     {"nbap", PRO_BOOL, false, OFF, &blanklines_after_procs},
     {"nbbb", PRO_BOOL, false, OFF, &blanklines_before_blockcomments},
     {"nbc", PRO_BOOL, true, ON, &ps.leave_comma},
     {"nbs", PRO_BOOL, false, OFF, &Bill_Shannon},
     {"ncdb", PRO_BOOL, true, OFF, &comment_delimiter_on_blankline},
     {"nce", PRO_BOOL, true, OFF, &cuddle_else},
     {"ndj", PRO_BOOL, false, OFF, &ps.ljust_decl},
     {"neei", PRO_BOOL, false, OFF, &extra_expression_indent},
     {"nei", PRO_BOOL, true, OFF, &ps.else_if},
     {"nfbs", PRO_BOOL, true, OFF, &function_brace_split},
     {"nfc1", PRO_BOOL, true, OFF, &format_col1_comments},
     {"nfcb", PRO_BOOL, true, OFF, &format_block_comments},
     {"nip", PRO_BOOL, true, OFF, &ps.indent_parameters},
     {"nlp", PRO_BOOL, true, OFF, &lineup_to_parens},
     {"npcs", PRO_BOOL, false, OFF, &proc_calls_space},
     {"npro", PRO_SPECIAL, 0, IGN, 0},
     {"npsl", PRO_BOOL, true, OFF, &procnames_start_line},
     {"nps", PRO_BOOL, false, OFF, &pointer_as_binop},
     {"nsc", PRO_BOOL, true, OFF, &star_comment_cont},
     {"nsob", PRO_BOOL, false, OFF, &swallow_optional_blanklines},
     {"nut", PRO_BOOL, true, OFF, &use_tabs},
     {"nv", PRO_BOOL, false, OFF, &verbose},
     {"pcs", PRO_BOOL, false, ON, &proc_calls_space},
     {"psl", PRO_BOOL, true, ON, &procnames_start_line},
     {"ps", PRO_BOOL, false, ON, &pointer_as_binop},
     {"sc", PRO_BOOL, true, ON, &star_comment_cont},
     {"sob", PRO_BOOL, false, ON, &swallow_optional_blanklines},
     {"st", PRO_SPECIAL, 0, STDIN, 0},
     {"ta", PRO_BOOL, false, ON, &auto_typedefs},
     {"troff", PRO_BOOL, false, ON, &troff},
     {"ut", PRO_BOOL, true, ON, &use_tabs},
     {"v", PRO_BOOL, false, ON, &verbose},
     /* whew! */
     {0, 0, 0, 0, 0}
 };
 
 /*
  * set_profile reads $HOME/.indent.pro and ./.indent.pro and handles arguments
  * given in these files.
  */
 void
 set_profile(void)
 {
     FILE *f;
     char fname[PATH_MAX];
     static char prof[] = ".indent.pro";
 
     snprintf(fname, sizeof(fname), "%s/%s", getenv("HOME"), prof);
     if ((f = fopen(option_source = fname, "r")) != NULL) {
 	scan_profile(f);
 	(void) fclose(f);
     }
     if ((f = fopen(option_source = prof, "r")) != NULL) {
 	scan_profile(f);
 	(void) fclose(f);
     }
     option_source = "Command line";
 }
 
 static void
 scan_profile(FILE *f)
 {
-    int		comment, i;
+    int		comment, i = 0;
     char	*p;
     char        buf[BUFSIZ];
 
     while (1) {
 	p = buf;
 	comment = 0;
-	while ((i = getc(f)) != EOF) {
+	while (p < buf + sizeof(buf) - 1 && (i = getc(f)) != EOF) {
 	    if (i == '*' && !comment && p > buf && p[-1] == '/') {
 		comment = p - buf;
 		*p++ = i;
 	    } else if (i == '/' && comment && p > buf && p[-1] == '*') {
 		p = buf + comment - 1;
 		comment = 0;
 	    } else if (isspace(i)) {
 		if (p > buf && !comment)
 		    break;
 	    } else {
 		*p++ = i;
 	    }
 	}
 	if (p != buf) {
 	    *p++ = 0;
 	    if (verbose)
 		printf("profile: %s\n", buf);
 	    set_option(buf);
 	}
 	else if (i == EOF)
 	    return;
     }
 }
 
 const char	*param_start;
 
 static int
 eqin(const char *s1, const char *s2)
 {
     while (*s1) {
 	if (*s1++ != *s2++)
 	    return (false);
     }
     param_start = s2;
     return (true);
 }
 
 /*
  * Set the defaults.
  */
 void
 set_defaults(void)
 {
     struct pro *p;
 
     /*
      * Because ps.case_indent is a float, we can't initialize it from the
      * table:
      */
     ps.case_indent = 0.0;	/* -cli0.0 */
     for (p = pro; p->p_name; p++)
 	if (p->p_type != PRO_SPECIAL && p->p_type != PRO_FONT)
 	    *p->p_obj = p->p_default;
 }
 
 void
 set_option(char *arg)
 {
     struct pro *p;
 
     arg++;			/* ignore leading "-" */
     for (p = pro; p->p_name; p++)
 	if (*p->p_name == *arg && eqin(p->p_name, arg))
 	    goto found;
     errx(1, "%s: unknown parameter \"%s\"", option_source, arg - 1);
 found:
     switch (p->p_type) {
 
     case PRO_SPECIAL:
 	switch (p->p_special) {
 
 	case IGN:
 	    break;
 
 	case CLI:
 	    if (*param_start == 0)
 		goto need_param;
 	    ps.case_indent = atof(param_start);
 	    break;
 
 	case STDIN:
-	    if (input == 0)
+	    if (input == NULL)
 		input = stdin;
-	    if (output == 0)
+	    if (output == NULL)
 		output = stdout;
 	    break;
 
 	case KEY:
 	    if (*param_start == 0)
 		goto need_param;
 	    {
 		char *str = strdup(param_start);
 		if (str == NULL)
 			err(1, NULL);
 		addkey(str, 4);
 	    }
 	    break;
 
 	default:
 	    errx(1, "set_option: internal error: p_special %d", p->p_special);
 	}
 	break;
 
     case PRO_BOOL:
 	if (p->p_special == OFF)
 	    *p->p_obj = false;
 	else
 	    *p->p_obj = true;
 	break;
 
     case PRO_INT:
-	if (!isdigit(*param_start)) {
+	if (!isdigit((unsigned char)*param_start)) {
     need_param:
 	    errx(1, "%s: ``%s'' requires a parameter", option_source, arg - 1);
 	}
 	*p->p_obj = atoi(param_start);
 	break;
 
     case PRO_FONT:
 	parsefont((struct fstate *) p->p_obj, param_start);
 	break;
 
     default:
 	errx(1, "set_option: internal error: p_type %d", p->p_type);
     }
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/indent/indent.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/indent/indent.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/indent/indent.c	(revision 303517)
@@ -1,1240 +1,1240 @@
 /*
  * Copyright (c) 1985 Sun Microsystems, Inc.
  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
  * Copyright (c) 1980, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1985 Sun Microsystems, Inc.\n\
 @(#) Copyright (c) 1976 Board of Trustees of the University of Illinois.\n\
 @(#) Copyright (c) 1980, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)indent.c	5.17 (Berkeley) 6/7/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <err.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include "indent_globs.h"
 #include "indent_codes.h"
 #include "indent.h"
 
 static void bakcopy(void);
 
 const char *in_name = "Standard Input";	/* will always point to name of input
 					 * file */
 const char *out_name = "Standard Output";	/* will always point to name
 						 * of output file */
 char        bakfile[MAXPATHLEN] = "";
 
 int
 main(int argc, char **argv)
 {
 
     int         dec_ind;	/* current indentation for declarations */
     int         di_stack[20];	/* a stack of structure indentation levels */
     int         flushed_nl;	/* used when buffering up comments to remember
 				 * that a newline was passed over */
     int         force_nl;	/* when true, code must be broken */
     int         hd_type = 0;	/* used to store type of stmt for if (...),
 				 * for (...), etc */
     int		i;		/* local loop counter */
     int         scase;		/* set to true when we see a case, so we will
 				 * know what to do with the following colon */
     int         sp_sw;		/* when true, we are in the expression of
 				 * if(...), while(...), etc. */
     int         squest;		/* when this is positive, we have seen a ?
 				 * without the matching : in a <c>?<s>:<s>
 				 * construct */
     const char *t_ptr;		/* used for copying tokens */
     int		tabs_to_var;	/* true if using tabs to indent to var name */
     int         type_code;	/* the type of token, returned by lexi */
 
     int         last_else = 0;	/* true iff last keyword was an else */
 
 
     /*-----------------------------------------------*\
     |		      INITIALIZATION		      |
     \*-----------------------------------------------*/
 
     found_err = 0;
 
     ps.p_stack[0] = stmt;	/* this is the parser's stack */
     ps.last_nl = true;		/* this is true if the last thing scanned was
 				 * a newline */
     ps.last_token = semicolon;
     combuf = (char *) malloc(bufsize);
     if (combuf == NULL)
 	err(1, NULL);
     labbuf = (char *) malloc(bufsize);
     if (labbuf == NULL)
 	err(1, NULL);
     codebuf = (char *) malloc(bufsize);
     if (codebuf == NULL)
 	err(1, NULL);
     tokenbuf = (char *) malloc(bufsize);
     if (tokenbuf == NULL)
 	err(1, NULL);
     l_com = combuf + bufsize - 5;
     l_lab = labbuf + bufsize - 5;
     l_code = codebuf + bufsize - 5;
     l_token = tokenbuf + bufsize - 5;
     combuf[0] = codebuf[0] = labbuf[0] = ' ';	/* set up code, label, and
 						 * comment buffers */
     combuf[1] = codebuf[1] = labbuf[1] = '\0';
     ps.else_if = 1;		/* Default else-if special processing to on */
     s_lab = e_lab = labbuf + 1;
     s_code = e_code = codebuf + 1;
     s_com = e_com = combuf + 1;
     s_token = e_token = tokenbuf + 1;
 
     in_buffer = (char *) malloc(10);
     if (in_buffer == NULL)
 	err(1, NULL);
     in_buffer_limit = in_buffer + 8;
     buf_ptr = buf_end = in_buffer;
     line_no = 1;
     had_eof = ps.in_decl = ps.decl_on_line = break_comma = false;
     sp_sw = force_nl = false;
     ps.in_or_st = false;
     ps.bl_line = true;
     dec_ind = 0;
     di_stack[ps.dec_nest = 0] = 0;
     ps.want_blank = ps.in_stmt = ps.ind_stmt = false;
 
     scase = ps.pcase = false;
     squest = 0;
-    sc_end = 0;
-    bp_save = 0;
-    be_save = 0;
+    sc_end = NULL;
+    bp_save = NULL;
+    be_save = NULL;
 
-    output = 0;
+    output = NULL;
     tabs_to_var = 0;
 
     /*--------------------------------------------------*\
     |   		COMMAND LINE SCAN		 |
     \*--------------------------------------------------*/
 
 #ifdef undef
     max_col = 78;		/* -l78 */
     lineup_to_parens = 1;	/* -lp */
     ps.ljust_decl = 0;		/* -ndj */
     ps.com_ind = 33;		/* -c33 */
     star_comment_cont = 1;	/* -sc */
     ps.ind_size = 8;		/* -i8 */
     verbose = 0;
     ps.decl_indent = 16;	/* -di16 */
     ps.local_decl_indent = -1;	/* if this is not set to some nonnegative value
 				 * by an arg, we will set this equal to
 				 * ps.decl_ind */
     ps.indent_parameters = 1;	/* -ip */
     ps.decl_com_ind = 0;	/* if this is not set to some positive value
 				 * by an arg, we will set this equal to
 				 * ps.com_ind */
     btype_2 = 1;		/* -br */
     cuddle_else = 1;		/* -ce */
     ps.unindent_displace = 0;	/* -d0 */
     ps.case_indent = 0;		/* -cli0 */
     format_block_comments = 1;	/* -fcb */
     format_col1_comments = 1;	/* -fc1 */
     procnames_start_line = 1;	/* -psl */
     proc_calls_space = 0;	/* -npcs */
     comment_delimiter_on_blankline = 1;	/* -cdb */
     ps.leave_comma = 1;		/* -nbc */
 #endif
 
     for (i = 1; i < argc; ++i)
 	if (strcmp(argv[i], "-npro") == 0)
 	    break;
     set_defaults();
     if (i >= argc)
 	set_profile();
 
     for (i = 1; i < argc; ++i) {
 
 	/*
 	 * look thru args (if any) for changes to defaults
 	 */
 	if (argv[i][0] != '-') {/* no flag on parameter */
 	    if (input == NULL) {	/* we must have the input file */
 		in_name = argv[i];	/* remember name of input file */
 		input = fopen(in_name, "r");
 		if (input == NULL)	/* check for open error */
 			err(1, "%s", in_name);
 		continue;
 	    }
 	    else if (output == NULL) {	/* we have the output file */
 		out_name = argv[i];	/* remember name of output file */
 		if (strcmp(in_name, out_name) == 0) {	/* attempt to overwrite
 							 * the file */
 		    errx(1, "input and output files must be different");
 		}
 		output = fopen(out_name, "w");
 		if (output == NULL)	/* check for create error */
 			err(1, "%s", out_name);
 		continue;
 	    }
 	    errx(1, "unknown parameter: %s", argv[i]);
 	}
 	else
 	    set_option(argv[i]);
     }				/* end of for */
     if (input == NULL)
 	input = stdin;
     if (output == NULL) {
 	if (troff || input == stdin)
 	    output = stdout;
 	else {
 	    out_name = in_name;
 	    bakcopy();
 	}
     }
     if (ps.com_ind <= 1)
 	ps.com_ind = 2;		/* dont put normal comments before column 2 */
     if (troff) {
 	if (bodyf.font[0] == 0)
 	    parsefont(&bodyf, "R");
 	if (scomf.font[0] == 0)
 	    parsefont(&scomf, "I");
 	if (blkcomf.font[0] == 0)
 	    blkcomf = scomf, blkcomf.size += 2;
 	if (boxcomf.font[0] == 0)
 	    boxcomf = blkcomf;
 	if (stringf.font[0] == 0)
 	    parsefont(&stringf, "L");
 	if (keywordf.font[0] == 0)
 	    parsefont(&keywordf, "B");
 	writefdef(&bodyf, 'B');
 	writefdef(&scomf, 'C');
 	writefdef(&blkcomf, 'L');
 	writefdef(&boxcomf, 'X');
 	writefdef(&stringf, 'S');
 	writefdef(&keywordf, 'K');
     }
     if (block_comment_max_col <= 0)
 	block_comment_max_col = max_col;
     if (ps.local_decl_indent < 0)	/* if not specified by user, set this */
 	ps.local_decl_indent = ps.decl_indent;
     if (ps.decl_com_ind <= 0)	/* if not specified by user, set this */
 	ps.decl_com_ind = ps.ljust_decl ? (ps.com_ind <= 10 ? 2 : ps.com_ind - 8) : ps.com_ind;
     if (continuation_indent == 0)
 	continuation_indent = ps.ind_size;
     fill_buffer();		/* get first batch of stuff into input buffer */
 
     parse(semicolon);
     {
 	char *p = buf_ptr;
 	int col = 1;
 
 	while (1) {
 	    if (*p == ' ')
 		col++;
 	    else if (*p == '\t')
 		col = ((col - 1) & ~7) + 9;
 	    else
 		break;
 	    p++;
 	}
 	if (col > ps.ind_size)
 	    ps.ind_level = ps.i_l_follow = col / ps.ind_size;
     }
     if (troff) {
 	const char *p = in_name,
 	           *beg = in_name;
 
 	while (*p)
 	    if (*p++ == '/')
 		beg = p;
 	fprintf(output, ".Fn \"%s\"\n", beg);
     }
     /*
      * START OF MAIN LOOP
      */
 
     while (1) {			/* this is the main loop.  it will go until we
 				 * reach eof */
 	int         is_procname;
 
 	type_code = lexi();	/* lexi reads one token.  The actual
 				 * characters read are stored in "token". lexi
 				 * returns a code indicating the type of token */
 	is_procname = ps.procname[0];
 
 	/*
 	 * The following code moves everything following an if (), while (),
 	 * else, etc. up to the start of the following stmt to a buffer. This
 	 * allows proper handling of both kinds of brace placement.
 	 */
 
 	flushed_nl = false;
 	while (ps.search_brace) {	/* if we scanned an if(), while(),
 					 * etc., we might need to copy stuff
 					 * into a buffer we must loop, copying
 					 * stuff into save_com, until we find
 					 * the start of the stmt which follows
 					 * the if, or whatever */
 	    switch (type_code) {
 	    case newline:
 		++line_no;
 		if (sc_end != NULL)
 		    goto sw_buffer;	/* dump comment, if any */
 		flushed_nl = true;
 	    case form_feed:
 		break;		/* form feeds and newlines found here will be
 				 * ignored */
 
 	    case lbrace:	/* this is a brace that starts the compound
 				 * stmt */
-		if (sc_end == 0) {	/* ignore buffering if a comment wasn't
+		if (sc_end == NULL) {	/* ignore buffering if a comment wasn't
 					 * stored up */
 		    ps.search_brace = false;
 		    goto check_type;
 		}
 		if (btype_2) {
 		    save_com[0] = '{';	/* we either want to put the brace
 					 * right after the if */
 		    goto sw_buffer;	/* go to common code to get out of
 					 * this loop */
 		}
 	    case comment:	/* we have a comment, so we must copy it into
 				 * the buffer */
-		if (!flushed_nl || sc_end != 0) {
-		    if (sc_end == 0) {	/* if this is the first comment, we
+		if (!flushed_nl || sc_end != NULL) {
+		    if (sc_end == NULL) {	/* if this is the first comment, we
 					 * must set up the buffer */
 			save_com[0] = save_com[1] = ' ';
 			sc_end = &(save_com[2]);
 		    }
 		    else {
 			*sc_end++ = '\n';	/* add newline between
 						 * comments */
 			*sc_end++ = ' ';
 			--line_no;
 		    }
 		    *sc_end++ = '/';	/* copy in start of comment */
 		    *sc_end++ = '*';
 
 		    for (;;) {	/* loop until we get to the end of the comment */
 			*sc_end = *buf_ptr++;
 			if (buf_ptr >= buf_end)
 			    fill_buffer();
 
 			if (*sc_end++ == '*' && *buf_ptr == '/')
 			    break;	/* we are at end of comment */
 
 			if (sc_end >= &(save_com[sc_size])) {	/* check for temp buffer
 								 * overflow */
 			    diag2(1, "Internal buffer overflow - Move big comment from right after if, while, or whatever");
 			    fflush(output);
 			    exit(1);
 			}
 		    }
 		    *sc_end++ = '/';	/* add ending slash */
 		    if (++buf_ptr >= buf_end)	/* get past / in buffer */
 			fill_buffer();
 		    break;
 		}
 	    default:		/* it is the start of a normal statement */
 		if (flushed_nl)	/* if we flushed a newline, make sure it is
 				 * put back */
 		    force_nl = true;
 		if ((type_code == sp_paren && *token == 'i'
 			&& last_else && ps.else_if)
 			|| (type_code == sp_nparen && *token == 'e'
 			&& e_code != s_code && e_code[-1] == '}'))
 		    force_nl = false;
 
-		if (sc_end == 0) {	/* ignore buffering if comment wasn't
+		if (sc_end == NULL) {	/* ignore buffering if comment wasn't
 					 * saved up */
 		    ps.search_brace = false;
 		    goto check_type;
 		}
 		if (force_nl) {	/* if we should insert a nl here, put it into
 				 * the buffer */
 		    force_nl = false;
 		    --line_no;	/* this will be re-increased when the nl is
 				 * read from the buffer */
 		    *sc_end++ = '\n';
 		    *sc_end++ = ' ';
 		    if (verbose && !flushed_nl)	/* print error msg if the line
 						 * was not already broken */
 			diag2(0, "Line broken");
 		    flushed_nl = false;
 		}
 		for (t_ptr = token; *t_ptr; ++t_ptr)
 		    *sc_end++ = *t_ptr;	/* copy token into temp buffer */
 		ps.procname[0] = 0;
 
 	sw_buffer:
 		ps.search_brace = false;	/* stop looking for start of
 						 * stmt */
 		bp_save = buf_ptr;	/* save current input buffer */
 		be_save = buf_end;
 		buf_ptr = save_com;	/* fix so that subsequent calls to
 					 * lexi will take tokens out of
 					 * save_com */
 		*sc_end++ = ' ';/* add trailing blank, just in case */
 		buf_end = sc_end;
-		sc_end = 0;
+		sc_end = NULL;
 		break;
 	    }			/* end of switch */
 	    if (type_code != 0)	/* we must make this check, just in case there
 				 * was an unexpected EOF */
 		type_code = lexi();	/* read another token */
 	    /* if (ps.search_brace) ps.procname[0] = 0; */
 	    if ((is_procname = ps.procname[0]) && flushed_nl
 		    && !procnames_start_line && ps.in_decl
 		    && type_code == ident)
 		flushed_nl = 0;
 	}			/* end of while (search_brace) */
 	last_else = 0;
 check_type:
 	if (type_code == 0) {	/* we got eof */
 	    if (s_lab != e_lab || s_code != e_code
 		    || s_com != e_com)	/* must dump end of line */
 		dump_line();
 	    if (ps.tos > 1)	/* check for balanced braces */
 		diag2(1, "Stuff missing from end of file");
 
 	    if (verbose) {
 		printf("There were %d output lines and %d comments\n",
 		       ps.out_lines, ps.out_coms);
 		printf("(Lines with comments)/(Lines with code): %6.3f\n",
 		       (1.0 * ps.com_lines) / code_lines);
 	    }
 	    fflush(output);
 	    exit(found_err);
 	}
 	if (
 		(type_code != comment) &&
 		(type_code != newline) &&
 		(type_code != preesc) &&
 		(type_code != form_feed)) {
 	    if (force_nl &&
 		    (type_code != semicolon) &&
 		    (type_code != lbrace || !btype_2)) {
 		/* we should force a broken line here */
 		if (verbose && !flushed_nl)
 		    diag2(0, "Line broken");
 		flushed_nl = false;
 		dump_line();
 		ps.want_blank = false;	/* dont insert blank at line start */
 		force_nl = false;
 	    }
 	    ps.in_stmt = true;	/* turn on flag which causes an extra level of
 				 * indentation. this is turned off by a ; or
 				 * '}' */
 	    if (s_com != e_com) {	/* the turkey has embedded a comment
 					 * in a line. fix it */
 		*e_code++ = ' ';
 		for (t_ptr = s_com; *t_ptr; ++t_ptr) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = *t_ptr;
 		}
 		*e_code++ = ' ';
 		*e_code = '\0';	/* null terminate code sect */
 		ps.want_blank = false;
 		e_com = s_com;
 	    }
 	}
 	else if (type_code != comment)	/* preserve force_nl thru a comment */
 	    force_nl = false;	/* cancel forced newline after newline, form
 				 * feed, etc */
 
 
 
 	/*-----------------------------------------------------*\
 	|	   do switch on type of token scanned		|
 	\*-----------------------------------------------------*/
 	CHECK_SIZE_CODE;
 	switch (type_code) {	/* now, decide what to do with the token */
 
 	case form_feed:	/* found a form feed in line */
 	    ps.use_ff = true;	/* a form feed is treated much like a newline */
 	    dump_line();
 	    ps.want_blank = false;
 	    break;
 
 	case newline:
 	    if (ps.last_token != comma || ps.p_l_follow > 0
 		    || !ps.leave_comma || ps.block_init || !break_comma || s_com != e_com) {
 		dump_line();
 		ps.want_blank = false;
 	    }
 	    ++line_no;		/* keep track of input line number */
 	    break;
 
 	case lparen:		/* got a '(' or '[' */
 	    ++ps.p_l_follow;	/* count parens to make Healy happy */
 	    if (ps.want_blank && *token != '[' &&
 		    (ps.last_token != ident || proc_calls_space
 	      || (ps.its_a_keyword && (!ps.sizeof_keyword || Bill_Shannon))))
 		*e_code++ = ' ';
 	    if (ps.in_decl && !ps.block_init)
 		if (troff && !ps.dumped_decl_indent && !is_procname && ps.last_token == decl) {
 		    ps.dumped_decl_indent = 1;
 		    sprintf(e_code, "\n.Du %dp+\200p \"%s\"\n", dec_ind * 7, token);
 		    e_code += strlen(e_code);
 		}
 		else {
 		    while ((e_code - s_code) < dec_ind) {
 			CHECK_SIZE_CODE;
 			*e_code++ = ' ';
 		    }
 		    *e_code++ = token[0];
 		}
 	    else
 		*e_code++ = token[0];
 	    ps.paren_indents[ps.p_l_follow - 1] = e_code - s_code;
 	    if (sp_sw && ps.p_l_follow == 1 && extra_expression_indent
 		    && ps.paren_indents[0] < 2 * ps.ind_size)
 		ps.paren_indents[0] = 2 * ps.ind_size;
 	    ps.want_blank = false;
 	    if (ps.in_or_st && *token == '(' && ps.tos <= 2) {
 		/*
 		 * this is a kluge to make sure that declarations will be
 		 * aligned right if proc decl has an explicit type on it, i.e.
 		 * "int a(x) {..."
 		 */
 		parse(semicolon);	/* I said this was a kluge... */
 		ps.in_or_st = false;	/* turn off flag for structure decl or
 					 * initialization */
 	    }
 	    if (ps.sizeof_keyword)
 		ps.sizeof_mask |= 1 << ps.p_l_follow;
 	    break;
 
 	case rparen:		/* got a ')' or ']' */
 	    rparen_count--;
 	    if (ps.cast_mask & (1 << ps.p_l_follow) & ~ps.sizeof_mask) {
 		ps.last_u_d = true;
 		ps.cast_mask &= (1 << ps.p_l_follow) - 1;
 		ps.want_blank = false;
 	    } else
 		ps.want_blank = true;
 	    ps.sizeof_mask &= (1 << ps.p_l_follow) - 1;
 	    if (--ps.p_l_follow < 0) {
 		ps.p_l_follow = 0;
 		diag3(0, "Extra %c", *token);
 	    }
 	    if (e_code == s_code)	/* if the paren starts the line */
 		ps.paren_level = ps.p_l_follow;	/* then indent it */
 
 	    *e_code++ = token[0];
 
 	    if (sp_sw && (ps.p_l_follow == 0)) {	/* check for end of if
 							 * (...), or some such */
 		sp_sw = false;
 		force_nl = true;/* must force newline after if */
 		ps.last_u_d = true;	/* inform lexi that a following
 					 * operator is unary */
 		ps.in_stmt = false;	/* dont use stmt continuation
 					 * indentation */
 
 		parse(hd_type);	/* let parser worry about if, or whatever */
 	    }
 	    ps.search_brace = btype_2;	/* this should insure that constructs
 					 * such as main(){...} and int[]{...}
 					 * have their braces put in the right
 					 * place */
 	    break;
 
 	case unary_op:		/* this could be any unary operation */
 	    if (ps.want_blank)
 		*e_code++ = ' ';
 
 	    if (troff && !ps.dumped_decl_indent && ps.in_decl && !is_procname) {
 		sprintf(e_code, "\n.Du %dp+\200p \"%s\"\n", dec_ind * 7, token);
 		ps.dumped_decl_indent = 1;
 		e_code += strlen(e_code);
 	    }
 	    else {
 		const char *res = token;
 
 		if (ps.in_decl && !ps.block_init) {	/* if this is a unary op
 							 * in a declaration, we
 							 * should indent this
 							 * token */
 		    for (i = 0; token[i]; ++i);	/* find length of token */
 		    while ((e_code - s_code) < (dec_ind - i)) {
 			CHECK_SIZE_CODE;
 			*e_code++ = ' ';	/* pad it */
 		    }
 		}
 		if (troff && token[0] == '-' && token[1] == '>')
 		    res = "\\(->";
 		for (t_ptr = res; *t_ptr; ++t_ptr) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = *t_ptr;
 		}
 	    }
 	    ps.want_blank = false;
 	    break;
 
 	case binary_op:	/* any binary operation */
 	    if (ps.want_blank)
 		*e_code++ = ' ';
 	    {
 		const char *res = token;
 
 		if (troff)
 		    switch (token[0]) {
 		    case '<':
 			if (token[1] == '=')
 			    res = "\\(<=";
 			break;
 		    case '>':
 			if (token[1] == '=')
 			    res = "\\(>=";
 			break;
 		    case '!':
 			if (token[1] == '=')
 			    res = "\\(!=";
 			break;
 		    case '|':
 			if (token[1] == '|')
 			    res = "\\(br\\(br";
 			else if (token[1] == 0)
 			    res = "\\(br";
 			break;
 		    }
 		for (t_ptr = res; *t_ptr; ++t_ptr) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = *t_ptr;	/* move the operator */
 		}
 	    }
 	    ps.want_blank = true;
 	    break;
 
 	case postop:		/* got a trailing ++ or -- */
 	    *e_code++ = token[0];
 	    *e_code++ = token[1];
 	    ps.want_blank = true;
 	    break;
 
 	case question:		/* got a ? */
 	    squest++;		/* this will be used when a later colon
 				 * appears so we can distinguish the
 				 * <c>?<n>:<n> construct */
 	    if (ps.want_blank)
 		*e_code++ = ' ';
 	    *e_code++ = '?';
 	    ps.want_blank = true;
 	    break;
 
 	case casestmt:		/* got word 'case' or 'default' */
 	    scase = true;	/* so we can process the later colon properly */
 	    goto copy_id;
 
 	case colon:		/* got a ':' */
 	    if (squest > 0) {	/* it is part of the <c>?<n>: <n> construct */
 		--squest;
 		if (ps.want_blank)
 		    *e_code++ = ' ';
 		*e_code++ = ':';
 		ps.want_blank = true;
 		break;
 	    }
 	    if (ps.in_or_st) {
 		*e_code++ = ':';
 		ps.want_blank = false;
 		break;
 	    }
 	    ps.in_stmt = false;	/* seeing a label does not imply we are in a
 				 * stmt */
 	    for (t_ptr = s_code; *t_ptr; ++t_ptr)
 		*e_lab++ = *t_ptr;	/* turn everything so far into a label */
 	    e_code = s_code;
 	    *e_lab++ = ':';
 	    *e_lab++ = ' ';
 	    *e_lab = '\0';
 
 	    force_nl = ps.pcase = scase;	/* ps.pcase will be used by
 						 * dump_line to decide how to
 						 * indent the label. force_nl
 						 * will force a case n: to be
 						 * on a line by itself */
 	    scase = false;
 	    ps.want_blank = false;
 	    break;
 
 	case semicolon:	/* got a ';' */
 	    if (ps.dec_nest == 0) {
 		/* we are not in an initialization or structure declaration */
 		ps.in_or_st = false;
 	    }
 	    scase = false;	/* these will only need resetting in an error */
 	    squest = 0;
 	    if (ps.last_token == rparen && rparen_count == 0)
 		ps.in_parameter_declaration = 0;
 	    ps.cast_mask = 0;
 	    ps.sizeof_mask = 0;
 	    ps.block_init = 0;
 	    ps.block_init_level = 0;
 	    ps.just_saw_decl--;
 
 	    if (ps.in_decl && s_code == e_code && !ps.block_init)
 		while ((e_code - s_code) < (dec_ind - 1)) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = ' ';
 		}
 
 	    ps.in_decl = (ps.dec_nest > 0);	/* if we were in a first level
 						 * structure declaration, we
 						 * arent any more */
 
 	    if ((!sp_sw || hd_type != forstmt) && ps.p_l_follow > 0) {
 
 		/*
 		 * This should be true iff there were unbalanced parens in the
 		 * stmt.  It is a bit complicated, because the semicolon might
 		 * be in a for stmt
 		 */
 		diag2(1, "Unbalanced parens");
 		ps.p_l_follow = 0;
 		if (sp_sw) {	/* this is a check for an if, while, etc. with
 				 * unbalanced parens */
 		    sp_sw = false;
 		    parse(hd_type);	/* dont lose the if, or whatever */
 		}
 	    }
 	    *e_code++ = ';';
 	    ps.want_blank = true;
 	    ps.in_stmt = (ps.p_l_follow > 0);	/* we are no longer in the
 						 * middle of a stmt */
 
 	    if (!sp_sw) {	/* if not if for (;;) */
 		parse(semicolon);	/* let parser know about end of stmt */
 		force_nl = true;/* force newline after an end of stmt */
 	    }
 	    break;
 
 	case lbrace:		/* got a '{' */
 	    ps.in_stmt = false;	/* dont indent the {} */
 	    if (!ps.block_init)
 		force_nl = true;/* force other stuff on same line as '{' onto
 				 * new line */
 	    else if (ps.block_init_level <= 0)
 		ps.block_init_level = 1;
 	    else
 		ps.block_init_level++;
 
 	    if (s_code != e_code && !ps.block_init) {
 		if (!btype_2) {
 		    dump_line();
 		    ps.want_blank = false;
 		}
 		else if (ps.in_parameter_declaration && !ps.in_or_st) {
 		    ps.i_l_follow = 0;
 		    if (function_brace_split) {	/* dump the line prior to the
 						 * brace ... */
 			dump_line();
 			ps.want_blank = false;
 		    } else	/* add a space between the decl and brace */
 			ps.want_blank = true;
 		}
 	    }
 	    if (ps.in_parameter_declaration)
 		prefix_blankline_requested = 0;
 
 	    if (ps.p_l_follow > 0) {	/* check for preceding unbalanced
 					 * parens */
 		diag2(1, "Unbalanced parens");
 		ps.p_l_follow = 0;
 		if (sp_sw) {	/* check for unclosed if, for, etc. */
 		    sp_sw = false;
 		    parse(hd_type);
 		    ps.ind_level = ps.i_l_follow;
 		}
 	    }
 	    if (s_code == e_code)
 		ps.ind_stmt = false;	/* dont put extra indentation on line
 					 * with '{' */
 	    if (ps.in_decl && ps.in_or_st) {	/* this is either a structure
 						 * declaration or an init */
 		di_stack[ps.dec_nest++] = dec_ind;
 		/* ?		dec_ind = 0; */
 	    }
 	    else {
 		ps.decl_on_line = false;	/* we can't be in the middle of
 						 * a declaration, so don't do
 						 * special indentation of
 						 * comments */
 		if (blanklines_after_declarations_at_proctop
 			&& ps.in_parameter_declaration)
 		    postfix_blankline_requested = 1;
 		ps.in_parameter_declaration = 0;
 	    }
 	    dec_ind = 0;
 	    parse(lbrace);	/* let parser know about this */
 	    if (ps.want_blank)	/* put a blank before '{' if '{' is not at
 				 * start of line */
 		*e_code++ = ' ';
 	    ps.want_blank = false;
 	    *e_code++ = '{';
 	    ps.just_saw_decl = 0;
 	    break;
 
 	case rbrace:		/* got a '}' */
 	    if (ps.p_stack[ps.tos] == decl && !ps.block_init)	/* semicolons can be
 								 * omitted in
 								 * declarations */
 		parse(semicolon);
 	    if (ps.p_l_follow) {/* check for unclosed if, for, else. */
 		diag2(1, "Unbalanced parens");
 		ps.p_l_follow = 0;
 		sp_sw = false;
 	    }
 	    ps.just_saw_decl = 0;
 	    ps.block_init_level--;
 	    if (s_code != e_code && !ps.block_init) {	/* '}' must be first on
 							 * line */
 		if (verbose)
 		    diag2(0, "Line broken");
 		dump_line();
 	    }
 	    *e_code++ = '}';
 	    ps.want_blank = true;
 	    ps.in_stmt = ps.ind_stmt = false;
 	    if (ps.dec_nest > 0) {	/* we are in multi-level structure
 					 * declaration */
 		dec_ind = di_stack[--ps.dec_nest];
 		if (ps.dec_nest == 0 && !ps.in_parameter_declaration)
 		    ps.just_saw_decl = 2;
 		ps.in_decl = true;
 	    }
 	    prefix_blankline_requested = 0;
 	    parse(rbrace);	/* let parser know about this */
 	    ps.search_brace = cuddle_else && ps.p_stack[ps.tos] == ifhead
 		&& ps.il[ps.tos] >= ps.ind_level;
 	    if (ps.tos <= 1 && blanklines_after_procs && ps.dec_nest <= 0)
 		postfix_blankline_requested = 1;
 	    break;
 
 	case swstmt:		/* got keyword "switch" */
 	    sp_sw = true;
 	    hd_type = swstmt;	/* keep this for when we have seen the
 				 * expression */
 	    goto copy_id;	/* go move the token into buffer */
 
 	case sp_paren:		/* token is if, while, for */
 	    sp_sw = true;	/* the interesting stuff is done after the
 				 * expression is scanned */
 	    hd_type = (*token == 'i' ? ifstmt :
 		       (*token == 'w' ? whilestmt : forstmt));
 
 	    /*
 	     * remember the type of header for later use by parser
 	     */
 	    goto copy_id;	/* copy the token into line */
 
 	case sp_nparen:	/* got else, do */
 	    ps.in_stmt = false;
 	    if (*token == 'e') {
 		if (e_code != s_code && (!cuddle_else || e_code[-1] != '}')) {
 		    if (verbose)
 			diag2(0, "Line broken");
 		    dump_line();/* make sure this starts a line */
 		    ps.want_blank = false;
 		}
 		force_nl = true;/* also, following stuff must go onto new line */
 		last_else = 1;
 		parse(elselit);
 	    }
 	    else {
 		if (e_code != s_code) {	/* make sure this starts a line */
 		    if (verbose)
 			diag2(0, "Line broken");
 		    dump_line();
 		    ps.want_blank = false;
 		}
 		force_nl = true;/* also, following stuff must go onto new line */
 		last_else = 0;
 		parse(dolit);
 	    }
 	    goto copy_id;	/* move the token into line */
 
 	case decl:		/* we have a declaration type (int, register,
 				 * etc.) */
 	    parse(decl);	/* let parser worry about indentation */
 	    if (ps.last_token == rparen && ps.tos <= 1) {
 		ps.in_parameter_declaration = 1;
 		if (s_code != e_code) {
 		    dump_line();
 		    ps.want_blank = 0;
 		}
 	    }
 	    if (ps.in_parameter_declaration && ps.indent_parameters && ps.dec_nest == 0) {
 		ps.ind_level = ps.i_l_follow = 1;
 		ps.ind_stmt = 0;
 	    }
 	    ps.in_or_st = true;	/* this might be a structure or initialization
 				 * declaration */
 	    ps.in_decl = ps.decl_on_line = true;
 	    if ( /* !ps.in_or_st && */ ps.dec_nest <= 0)
 		ps.just_saw_decl = 2;
 	    prefix_blankline_requested = 0;
 	    for (i = 0; token[i++];);	/* get length of token */
 
 	    if (ps.ind_level == 0 || ps.dec_nest > 0) {
 		/* global variable or struct member in local variable */
 		dec_ind = ps.decl_indent > 0 ? ps.decl_indent : i;
 		tabs_to_var = (use_tabs ? ps.decl_indent > 0 : 0);
 	    } else {
 		/* local variable */
 		dec_ind = ps.local_decl_indent > 0 ? ps.local_decl_indent : i;
 		tabs_to_var = (use_tabs ? ps.local_decl_indent > 0 : 0);
 	    }
 	    goto copy_id;
 
 	case ident:		/* got an identifier or constant */
 	    if (ps.in_decl) {	/* if we are in a declaration, we must indent
 				 * identifier */
 		if (is_procname == 0 || !procnames_start_line) {
 		    if (!ps.block_init) {
 			if (troff && !ps.dumped_decl_indent) {
 			    if (ps.want_blank)
 				*e_code++ = ' ';
 			    ps.want_blank = false;
 			    sprintf(e_code, "\n.De %dp+\200p\n", dec_ind * 7);
 			    ps.dumped_decl_indent = 1;
 			    e_code += strlen(e_code);
 			} else {
 			    int cur_dec_ind;
 			    int pos, startpos;
 
 			    /*
 			     * in order to get the tab math right for
 			     * indentations that are not multiples of 8 we
 			     * need to modify both startpos and dec_ind
 			     * (cur_dec_ind) here by eight minus the
 			     * remainder of the current starting column
 			     * divided by eight. This seems to be a
 			     * properly working fix
 			     */
 			    startpos = e_code - s_code;
 			    cur_dec_ind = dec_ind;
 			    pos = startpos;
 			    if ((ps.ind_level * ps.ind_size) % 8 != 0) {
 				pos += (ps.ind_level * ps.ind_size) % 8;
 				cur_dec_ind += (ps.ind_level * ps.ind_size) % 8;
 			    }
 
 			    if (tabs_to_var) {
 				while ((pos & ~7) + 8 <= cur_dec_ind) {
 				    CHECK_SIZE_CODE;
 				    *e_code++ = '\t';
 				    pos = (pos & ~7) + 8;
 				}
 			    }
 			    while (pos < cur_dec_ind) {
 				CHECK_SIZE_CODE;
 				*e_code++ = ' ';
 				pos++;
 			    }
 			    if (ps.want_blank && e_code - s_code == startpos)
 				*e_code++ = ' ';
 			    ps.want_blank = false;
 			}
 		    }
 		} else {
 		    if (ps.want_blank)
 			*e_code++ = ' ';
 		    ps.want_blank = false;
 		    if (dec_ind && s_code != e_code) {
 			*e_code = '\0';
 			dump_line();
 		    }
 		    dec_ind = 0;
 		}
 	    }
 	    else if (sp_sw && ps.p_l_follow == 0) {
 		sp_sw = false;
 		force_nl = true;
 		ps.last_u_d = true;
 		ps.in_stmt = false;
 		parse(hd_type);
 	    }
     copy_id:
 	    if (ps.want_blank)
 		*e_code++ = ' ';
 	    if (troff && ps.its_a_keyword) {
 		e_code = chfont(&bodyf, &keywordf, e_code);
 		for (t_ptr = token; *t_ptr; ++t_ptr) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = keywordf.allcaps && islower(*t_ptr)
 			? toupper(*t_ptr) : *t_ptr;
 		}
 		e_code = chfont(&keywordf, &bodyf, e_code);
 	    }
 	    else
 		for (t_ptr = token; *t_ptr; ++t_ptr) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = *t_ptr;
 		}
 	    ps.want_blank = true;
 	    break;
 
 	case period:		/* treat a period kind of like a binary
 				 * operation */
 	    *e_code++ = '.';	/* move the period into line */
 	    ps.want_blank = false;	/* dont put a blank after a period */
 	    break;
 
 	case comma:
 	    ps.want_blank = (s_code != e_code);	/* only put blank after comma
 						 * if comma does not start the
 						 * line */
 	    if (ps.in_decl && is_procname == 0 && !ps.block_init)
 		while ((e_code - s_code) < (dec_ind - 1)) {
 		    CHECK_SIZE_CODE;
 		    *e_code++ = ' ';
 		}
 
 	    *e_code++ = ',';
 	    if (ps.p_l_follow == 0) {
 		if (ps.block_init_level <= 0)
 		    ps.block_init = 0;
 		if (break_comma && (!ps.leave_comma || compute_code_target() + (e_code - s_code) > max_col - 8))
 		    force_nl = true;
 	    }
 	    break;
 
 	case preesc:		/* got the character '#' */
 	    if ((s_com != e_com) ||
 		    (s_lab != e_lab) ||
 		    (s_code != e_code))
 		dump_line();
 	    *e_lab++ = '#';	/* move whole line to 'label' buffer */
 	    {
 		int         in_comment = 0;
 		int         com_start = 0;
 		char        quote = 0;
 		int         com_end = 0;
 
 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {
 		    buf_ptr++;
 		    if (buf_ptr >= buf_end)
 			fill_buffer();
 		}
 		while (*buf_ptr != '\n' || (in_comment && !had_eof)) {
 		    CHECK_SIZE_LAB;
 		    *e_lab = *buf_ptr++;
 		    if (buf_ptr >= buf_end)
 			fill_buffer();
 		    switch (*e_lab++) {
 		    case BACKSLASH:
 			if (troff)
 			    *e_lab++ = BACKSLASH;
 			if (!in_comment) {
 			    *e_lab++ = *buf_ptr++;
 			    if (buf_ptr >= buf_end)
 				fill_buffer();
 			}
 			break;
 		    case '/':
 			if (*buf_ptr == '*' && !in_comment && !quote) {
 			    in_comment = 1;
 			    *e_lab++ = *buf_ptr++;
 			    com_start = e_lab - s_lab - 2;
 			}
 			break;
 		    case '"':
 			if (quote == '"')
 			    quote = 0;
 			break;
 		    case '\'':
 			if (quote == '\'')
 			    quote = 0;
 			break;
 		    case '*':
 			if (*buf_ptr == '/' && in_comment) {
 			    in_comment = 0;
 			    *e_lab++ = *buf_ptr++;
 			    com_end = e_lab - s_lab;
 			}
 			break;
 		    }
 		}
 
 		while (e_lab > s_lab && (e_lab[-1] == ' ' || e_lab[-1] == '\t'))
 		    e_lab--;
-		if (e_lab - s_lab == com_end && bp_save == 0) {	/* comment on
-								 * preprocessor line */
-		    if (sc_end == 0)	/* if this is the first comment, we
+		/* comment on preprocessor line */
+		if (e_lab - s_lab == com_end && bp_save == NULL) {
+		    if (sc_end == NULL)	/* if this is the first comment, we
 					 * must set up the buffer */
 			sc_end = &(save_com[0]);
 		    else {
 			*sc_end++ = '\n';	/* add newline between
 						 * comments */
 			*sc_end++ = ' ';
 			--line_no;
 		    }
 		    bcopy(s_lab + com_start, sc_end, com_end - com_start);
 		    sc_end += com_end - com_start;
 		    if (sc_end >= &save_com[sc_size])
 			abort();
 		    e_lab = s_lab + com_start;
 		    while (e_lab > s_lab && (e_lab[-1] == ' ' || e_lab[-1] == '\t'))
 			e_lab--;
 		    bp_save = buf_ptr;	/* save current input buffer */
 		    be_save = buf_end;
 		    buf_ptr = save_com;	/* fix so that subsequent calls to
 					 * lexi will take tokens out of
 					 * save_com */
 		    *sc_end++ = ' ';	/* add trailing blank, just in case */
 		    buf_end = sc_end;
-		    sc_end = 0;
+		    sc_end = NULL;
 		}
 		*e_lab = '\0';	/* null terminate line */
 		ps.pcase = false;
 	    }
 
 	    if (strncmp(s_lab, "#if", 3) == 0) {
 		if (blanklines_around_conditional_compilation) {
 		    int c;
 		    prefix_blankline_requested++;
 		    while ((c = getc(input)) == '\n');
 		    ungetc(c, input);
 		}
 		if ((size_t)ifdef_level < sizeof(state_stack)/sizeof(state_stack[0])) {
 		    match_state[ifdef_level].tos = -1;
 		    state_stack[ifdef_level++] = ps;
 		}
 		else
 		    diag2(1, "#if stack overflow");
 	    }
 	    else if (strncmp(s_lab, "#else", 5) == 0)
 		if (ifdef_level <= 0)
 		    diag2(1, "Unmatched #else");
 		else {
 		    match_state[ifdef_level - 1] = ps;
 		    ps = state_stack[ifdef_level - 1];
 		}
 	    else if (strncmp(s_lab, "#endif", 6) == 0) {
 		if (ifdef_level <= 0)
 		    diag2(1, "Unmatched #endif");
 		else {
 		    ifdef_level--;
 
 #ifdef undef
 		    /*
 		     * This match needs to be more intelligent before the
 		     * message is useful
 		     */
 		    if (match_state[ifdef_level].tos >= 0
 			  && bcmp(&ps, &match_state[ifdef_level], sizeof ps))
 			diag2(0, "Syntactically inconsistent #ifdef alternatives");
 #endif
 		}
 		if (blanklines_around_conditional_compilation) {
 		    postfix_blankline_requested++;
 		    n_real_blanklines = 0;
 		}
 	    }
 	    break;		/* subsequent processing of the newline
 				 * character will cause the line to be printed */
 
 	case comment:		/* we have gotten a / followed by * this is a biggie */
 	    if (flushed_nl) {	/* we should force a broken line here */
 		flushed_nl = false;
 		dump_line();
 		ps.want_blank = false;	/* dont insert blank at line start */
 		force_nl = false;
 	    }
 	    pr_comment();
 	    break;
 	}			/* end of big switch stmt */
 
 	*e_code = '\0';		/* make sure code section is null terminated */
 	if (type_code != comment && type_code != newline && type_code != preesc)
 	    ps.last_token = type_code;
     }				/* end of main while (1) loop */
 }
 
 /*
  * Back up the input file and arrange for it to be rewritten in place.
  *
  * If in_name is /blah/blah/blah/file, the backup file is named "file.BAK";
  * the directory part of in_name is dropped, so the name is relative to the
  * current working directory.  The contents of the input are copied to the
  * backup, then the backup becomes the input and the original input file
  * becomes the output.  Failure to create, write, close or reopen the backup,
  * or to open in_name for writing, terminates the program with status 1.
  */
 static void
 bakcopy(void)
 {
     int         n,
                 bakchn;
     char        buff[8 * 1024];
     const char *p;
 
     /* construct file name file.BAK from the last component of in_name */
     p = strrchr(in_name, '/');
     p = (p != NULL) ? p + 1 : in_name;
     /*
      * NOTE(review): unbounded write; bakfile is declared outside this
      * function -- confirm it can hold any file name plus ".BAK"
      */
     sprintf(bakfile, "%s.BAK", p);
 
     /* copy in_name to backup file */
     bakchn = creat(bakfile, 0600);
     if (bakchn < 0)
 	err(1, "%s", bakfile);
     while ((n = read(fileno(input), buff, sizeof(buff))) > 0)
 	if (write(bakchn, buff, n) != n)
 	    err(1, "%s", bakfile);
     if (n < 0)
 	err(1, "%s", in_name);
     /*
      * the original is about to be truncated, so do not carry on with a
      * backup whose close reported an error
      */
     if (close(bakchn) < 0)
 	err(1, "%s", bakfile);
     fclose(input);
 
     /* re-open backup file as the input file */
     input = fopen(bakfile, "r");
     if (input == NULL)
 	err(1, "%s", bakfile);
     /* now the original input file will be the output */
     output = fopen(in_name, "w");
     if (output == NULL) {
 	/* report first: unlink() may overwrite the errno set by fopen() */
 	warn("%s", in_name);
 	unlink(bakfile);
 	exit(1);
     }
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/indent/io.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/indent/io.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/indent/io.c	(revision 303517)
@@ -1,668 +1,668 @@
 /*
  * Copyright (c) 1985 Sun Microsystems, Inc.
  * Copyright (c) 1980, 1993
  *	The Regents of the University of California.  All rights reserved.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)io.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <ctype.h>
 #include <err.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "indent_globs.h"
 #include "indent.h"
 
 int         comment_open;	/* troff comment state kept by dump_line():
 				 * set to 1 + ps.box_com once a comment has
 				 * been started, reset to 0 whenever the
 				 * comment terminator is written */
 static int  paren_target;	/* negated paren_indents entry saved at the
 				 * end of dump_line(); read back by
 				 * compute_code_target() */
 static int pad_output(int current, int target);
 
 void
 dump_line(void)
 {				/* dump_line is the routine that actually
 				 * effects the printing of the new source. It
 				 * prints the label section, followed by the
 				 * code section with the appropriate nesting
 				 * level, followed by any comments.  On return
 				 * the label, code and comment buffers are
 				 * reset to empty. */
     int cur_col,
                 target_col = 1;
     static int  not_first_line;	/* zero only during the first call */
 
     if (ps.procname[0]) {	/* a procedure name is pending */
 	if (troff) {
 	    if (comment_open) {
 		comment_open = 0;
 		fprintf(output, ".*/\n");
 	    }
 	    fprintf(output, ".Pr \"%s\"\n", ps.procname);
 	}
 	ps.ind_level = 0;
 	ps.procname[0] = 0;
     }
     if (s_code == e_code && s_lab == e_lab && s_com == e_com) {
 	/* all three buffers are empty: this is a blank line */
 	if (suppress_blanklines > 0)
 	    suppress_blanklines--;
 	else {
 	    ps.bl_line = true;
 	    n_real_blanklines++;
 	}
     }
     else if (!inhibit_formatting) {
 	suppress_blanklines = 0;
 	ps.bl_line = false;
 	if (prefix_blankline_requested && not_first_line) {
 	    if (swallow_optional_blanklines) {
 		if (n_real_blanklines == 1)
 		    n_real_blanklines = 0;
 	    }
 	    else {
 		if (n_real_blanklines == 0)
 		    n_real_blanklines = 1;
 	    }
 	}
 	/* emit the blank lines accumulated since the last printed line */
 	while (--n_real_blanklines >= 0)
 	    putc('\n', output);
 	n_real_blanklines = 0;
 	if (ps.ind_level == 0)
 	    ps.ind_stmt = 0;	/* this is a class A kludge. dont do
 				 * additional statement indentation if we are
 				 * at bracket level 0 */
 
 	if (e_lab != s_lab || e_code != s_code)
 	    ++code_lines;	/* keep count of lines with code */
 
 
 	if (e_lab != s_lab) {	/* print lab, if any */
 	    if (comment_open) {
 		comment_open = 0;
 		fprintf(output, ".*/\n");
 	    }
 	    /* strip trailing blanks from the label */
 	    while (e_lab > s_lab && (e_lab[-1] == ' ' || e_lab[-1] == '\t'))
 		e_lab--;
 	    *e_lab = '\0';
 	    cur_col = pad_output(1, compute_label_target());
 	    if (s_lab[0] == '#' && (strncmp(s_lab, "#else", 5) == 0
 				    || strncmp(s_lab, "#endif", 6) == 0)) {
 		/*
 		 * #else / #endif: print the directive word, then whatever
 		 * follows it as a comment (wrapping it in comment
 		 * delimiters unless it already starts with one)
 		 */
 		char *s = s_lab;
 		if (e_lab[-1] == '\n') e_lab--;
 		do putc(*s++, output);
 		while (s < e_lab && 'a' <= *s && *s<='z');
 		while ((*s == ' ' || *s == '\t') && s < e_lab)
 		    s++;
 		if (s < e_lab)
 		    fprintf(output, s[0]=='/' && s[1]=='*' ? "\t%.*s" : "\t/* %.*s */",
 			    (int)(e_lab - s), s);
 	    }
 	    else fprintf(output, "%.*s", (int)(e_lab - s_lab), s_lab);
 	    cur_col = count_spaces(cur_col, s_lab);
 	}
 	else
 	    cur_col = 1;	/* there is no label section */
 
 	ps.pcase = false;
 
 	if (s_code != e_code) {	/* print code section, if any */
 	    char *p;
 
 	    if (comment_open) {
 		comment_open = 0;
 		fprintf(output, ".*/\n");
 	    }
 	    target_col = compute_code_target();
 	    {
 		int i;
 
 		/*
 		 * entries that are still non-negative are converted to
 		 * negated values offset by this line's target column
 		 */
 		for (i = 0; i < ps.p_l_follow; i++)
 		    if (ps.paren_indents[i] >= 0)
 			ps.paren_indents[i] = -(ps.paren_indents[i] + target_col);
 	    }
 	    cur_col = pad_output(cur_col, target_col);
 	    for (p = s_code; p < e_code; p++)
 		if (*p == (char) 0200)	/* marker byte: print a number instead */
 		    fprintf(output, "%d", target_col * 7);
 		else
 		    putc(*p, output);
 	    cur_col = count_spaces(cur_col, s_code);
 	}
 	if (s_com != e_com) {
 	    if (troff) {
 		int         all_here = 0;	/* counts comment delimiters
 						 * found in this buffer */
 		char *p;
 
 		if (e_com[-1] == '/' && e_com[-2] == '*')
 		    e_com -= 2, all_here++;
 		while (e_com > s_com && e_com[-1] == ' ')
 		    e_com--;
 		*e_com = 0;
 		p = s_com;
 		while (*p == ' ')
 		    p++;
 		if (p[0] == '/' && p[1] == '*')
 		    p += 2, all_here++;
 		else if (p[0] == '*')
 		    p += p[1] == '/' ? 2 : 1;
 		while (*p == ' ')
 		    p++;
 		if (*p == 0)	/* nothing left to print */
 		    goto inhibit_newline;
 		if (comment_open < 2 && ps.box_com) {
 		    comment_open = 0;
 		    fprintf(output, ".*/\n");
 		}
 		if (comment_open == 0) {
 		    /* capitalize the first letter of a new comment */
 		    if ('a' <= *p && *p <= 'z')
 			*p = *p + 'A' - 'a';
 		    if (e_com - p < 50 && all_here == 2) {
 			char *follow = p;
 			fprintf(output, "\n.nr C! \\w\1");
 			while (follow < e_com) {
 			    switch (*follow) {
 			    case '\n':
 				putc(' ', output);
 				/* FALLTHROUGH */
 			    case 1:
 				break;
 			    case '\\':
 				putc('\\', output);
 				/* FALLTHROUGH */
 			    default:
 				putc(*follow, output);
 			    }
 			    follow++;
 			}
 			putc(1, output);
 		    }
 		    fprintf(output, "\n./* %dp %d %dp\n",
 			    ps.com_col * 7,
 			    (s_code != e_code || s_lab != e_lab) - ps.box_com,
 			    target_col * 7);
 		}
 		comment_open = 1 + ps.box_com;
 		while (*p) {	/* double every backslash on output */
 		    if (*p == BACKSLASH)
 			putc(BACKSLASH, output);
 		    putc(*p++, output);
 		}
 	    }
 	    else {		/* print comment, if any */
 		int target = ps.com_col;
 		char *com_st = s_com;
 
 		target += ps.comment_delta;
 		while (*com_st == '\t')
 		    com_st++, target += 8;	/* ? */
 		/* consume leading blanks until the target column is positive */
 		while (target <= 0)
 		    if (*com_st == ' ')
 			target++, com_st++;
 		    else if (*com_st == '\t')
 			target = ((target - 1) & ~7) + 9, com_st++;
 		    else
 			target = 1;
 		if (cur_col > target) {	/* if comment can't fit on this line,
 					 * put it on next line */
 		    putc('\n', output);
 		    cur_col = 1;
 		    ++ps.out_lines;
 		}
 		/* cast: isspace() is undefined for negative char values */
 		while (e_com > com_st && isspace((unsigned char)e_com[-1]))
 		    e_com--;
 		cur_col = pad_output(cur_col, target);
 		if (!ps.box_com) {
 		    if (star_comment_cont && (com_st[1] != '*' || e_com <= com_st + 1)) {
 			if (com_st[1] == ' ' && com_st[0] == ' ' && e_com > com_st + 1)
 			    com_st[1] = '*';
 			else
 			    fwrite(" * ", com_st[0] == '\t' ? 2 : com_st[0] == '*' ? 1 : 3, 1, output);
 		    }
 		}
 		fwrite(com_st, e_com - com_st, 1, output);
 		ps.comment_delta = ps.n_comment_delta;
 		cur_col = count_spaces(cur_col, com_st);
 		++ps.com_lines;	/* count lines with comments */
 	    }
 	}
 	if (ps.use_ff)
 	    putc('\014', output);
 	else
 	    putc('\n', output);
 inhibit_newline:
 	++ps.out_lines;
 	if (ps.just_saw_decl == 1 && blanklines_after_declarations) {
 	    prefix_blankline_requested = 1;
 	    ps.just_saw_decl = 0;
 	}
 	else
 	    prefix_blankline_requested = postfix_blankline_requested;
 	postfix_blankline_requested = 0;
     }
     ps.decl_on_line = ps.in_decl;	/* if we are in the middle of a
 					 * declaration, remember that fact for
 					 * proper comment indentation */
     ps.ind_stmt = ps.in_stmt & ~ps.in_decl;	/* next line should be
 						 * indented if we have not
 						 * completed this stmt and if
 						 * we are not in the middle of
 						 * a declaration */
     ps.use_ff = false;
     ps.dumped_decl_indent = 0;
     *(e_lab = s_lab) = '\0';	/* reset buffers */
     *(e_code = s_code) = '\0';
     *(e_com = s_com) = '\0';
     ps.ind_level = ps.i_l_follow;
     ps.paren_level = ps.p_l_follow;
     /*
      * Only look at paren_indents when there is an open parenthesis;
      * with paren_level == 0 the index would be -1.  compute_code_target()
      * reads paren_target only when ps.paren_level is non-zero.
      */
     if (ps.paren_level > 0)
 	paren_target = -ps.paren_indents[ps.paren_level - 1];
     not_first_line = 1;
 }
 
 int
 compute_code_target(void)
 {
     /* column implied by the current nesting level alone */
     int base_col = ps.ind_size * ps.ind_level + 1;
 
     if (!ps.paren_level) {
 	/* not inside parentheses: only a continued statement is indented */
 	if (ps.ind_stmt)
 	    base_col += continuation_indent;
 	return base_col;
     }
 
     if (!lineup_to_parens) {
 	/* fixed continuation indent, scaled by paren depth unless the
 	 * continuation indent is exactly half the indent size */
 	base_col += continuation_indent
 	    * (2 * continuation_indent == ps.ind_size ? 1 : ps.paren_level);
 	return base_col;
     }
 
     {
 	/* line up with the saved parenthesis column, pulling the code left
 	 * when it would overflow max_col but fits at the base column */
 	int lined_up = paren_target;
 	int overflow = count_spaces(lined_up, s_code) - max_col;
 
 	if (overflow > 0 && count_spaces(base_col, s_code) <= max_col) {
 	    lined_up -= overflow + 1;
 	    return (lined_up > base_col) ? lined_up : base_col;
 	}
 	return lined_up;
     }
 }
 
 int
 compute_label_target(void)
 {
     /* case labels: indented by case_ind indentation units */
     if (ps.pcase)
 	return (int) (case_ind * ps.ind_size) + 1;
     /* label text starting with '#' always goes in column 1 */
     if (*s_lab == '#')
 	return 1;
     /* ordinary labels: current level shifted by label_offset */
     return ps.ind_size * (ps.ind_level - label_offset) + 1;
 }
 
 
 /*
  * Copyright (C) 1976 by the Board of Trustees of the University of Illinois
  *
  * All rights reserved
  *
  *
  * NAME: fill_buffer
  *
  * FUNCTION: Reads one block of input into input_buffer
  *
  * HISTORY: initial coding 	November 1976	D A Willcox of CAC 1/7/77 A
  * Willcox of CAC	Added check for switch back to partly full input
  * buffer from temporary buffer
  *
  */
 void
 fill_buffer(void)
 {				/* this routine reads stuff from the input:
 				 * one line (up to and including the newline)
 				 * is stored in in_buffer, and buf_ptr/buf_end
 				 * are set to delimit it.  At end of file a
 				 * blank and a newline are appended and
 				 * had_eof is set. */
     char *p;
     int i;
     FILE *f = input;
 
-    if (bp_save != 0) {		/* there is a partly filled input buffer left */
+    if (bp_save != NULL) {		/* there is a partly filled input buffer left */
 	buf_ptr = bp_save;	/* dont read anything, just switch buffers */
 	buf_end = be_save;
-	bp_save = be_save = 0;
+	bp_save = be_save = NULL;
 	if (buf_ptr < buf_end)
 	    return;		/* only return if there is really something in
 				 * this buffer */
     }
     for (p = in_buffer;;) {
 	if (p >= in_buffer_limit) {
 	    /* grow the buffer; the limit keeps two bytes in reserve for the
 	     * blank and newline appended at end of file */
 	    int size = (in_buffer_limit - in_buffer) * 2 + 10;
 	    int offset = p - in_buffer;
 	    in_buffer = realloc(in_buffer, size);
 	    if (in_buffer == NULL)
 		errx(1, "input line too long");
 	    p = in_buffer + offset;
 	    in_buffer_limit = in_buffer + size - 2;
 	}
 	if ((i = getc(f)) == EOF) {
 		*p++ = ' ';
 		*p++ = '\n';
 		had_eof = true;
 		break;
 	}
 	*p++ = i;
 	if (i == '\n')
 		break;
     }
     buf_ptr = in_buffer;
     buf_end = p;
     /*
      * Does the line end with a comment terminator?  The length test keeps
      * p[-2] and p[-3] inside the buffer for lines shorter than three bytes
      * (for example an empty line).
      */
     if (p - in_buffer > 2 && p[-2] == '/' && p[-3] == '*') {
 	if (in_buffer[3] == 'I' && strncmp(in_buffer, "/**INDENT**", 11) == 0)
 	    fill_buffer();	/* flush indent error message */
 	else {
 	    int         com = 0;	/* 1: INDENT / INDENT ON seen,
 					 * 2: INDENT OFF seen */
 
 	    p = in_buffer;
 	    while (*p == ' ' || *p == '\t')
 		p++;
 	    if (*p == '/' && p[1] == '*') {
 		p += 2;
 		while (*p == ' ' || *p == '\t')
 		    p++;
 		if (p[0] == 'I' && p[1] == 'N' && p[2] == 'D' && p[3] == 'E'
 			&& p[4] == 'N' && p[5] == 'T') {
 		    p += 6;
 		    while (*p == ' ' || *p == '\t')
 			p++;
 		    if (*p == '*')
 			com = 1;
 		    else if (*p == 'O') {
 			if (*++p == 'N')
 			    p++, com = 1;
 			else if (*p == 'F' && *++p == 'F')
 			    p++, com = 2;
 		    }
 		    while (*p == ' ' || *p == '\t')
 			p++;
 		    if (p[0] == '*' && p[1] == '/' && p[2] == '\n' && com) {
 			/* a complete control comment: flush pending output,
 			 * then switch formatting on (com == 1) or off */
 			if (s_com != e_com || s_lab != e_lab || s_code != e_code)
 			    dump_line();
 			if (!(inhibit_formatting = com - 1)) {
 			    n_real_blanklines = 0;
 			    postfix_blankline_requested = 0;
 			    prefix_blankline_requested = 0;
 			    suppress_blanklines = 1;
 			}
 		    }
 		}
 	    }
 	}
     }
     if (inhibit_formatting) {
 	/* formatting is off: copy the line to the output unchanged */
 	p = in_buffer;
 	do
 	    putc(*p, output);
 	while (*p++ != '\n');
     }
 }
 
 /*
  * Copyright (C) 1976 by the Board of Trustees of the University of Illinois
  *
  * All rights reserved
  *
  *
  * NAME: pad_output
  *
  * FUNCTION: Writes tabs and spaces to move the current column up to the desired
  * position.
  *
  * ALGORITHM: Put tabs and/or blanks into pobuf, then write pobuf.
  *
  * PARAMETERS: current		integer		The current column
  *             target		integer		The desired column
  *
  * RETURNS: Integer value of the new column.  (If current >= target, no action is
  * taken, and current is returned.)
  *
  * GLOBALS: None
  *
  * CALLS: write (sys)
  *
  * CALLED BY: dump_line
  *
  * HISTORY: initial coding 	November 1976	D A Willcox of CAC
  *
  */
 static int
 pad_output(int current, int target)
 			        /* writes tabs and blanks (if necessary) to
 				 * get the current output position up to the
 				 * target column */
     /* current: the current column value */
     /* target: position we want it at */
 {
     int curr;		/* internal column pointer */
     int tcur;
 
     if (troff)
 	fprintf(output, "\\h'|%dp'", (target - 1) * 7);
     else {
 	if (current >= target)
 	    return (current);	/* line is already long enough */
 	curr = current;
         if (use_tabs) {
             while ((tcur = ((curr - 1) & tabmask) + tabsize + 1) <= target) {
                 putc('\t', output);
                 curr = tcur;
             }
         }
         while (curr++ < target)
 	    putc(' ', output);	/* pad with final blanks */
     }
     return (target);
 }
 
 /*
  * Copyright (C) 1976 by the Board of Trustees of the University of Illinois
  *
  * All rights reserved
  *
  *
  * NAME: count_spaces
  *
  * FUNCTION: Find out where printing of a given string will leave the current
  * character position on output.
  *
  * ALGORITHM: Run thru input string and add appropriate values to current
  * position.
  *
  * RETURNS: Integer value of position after printing "buffer" starting in column
  * "current".
  *
  * HISTORY: initial coding 	November 1976	D A Willcox of CAC
  *
  */
 int
 count_spaces(int current, char *buffer)
 /*
  * Returns the column the output will be in after the NUL-terminated text in
  * "buffer" has been printed starting at column "current".
  */
 {
     int col = current;		/* running column */
     char *scan = buffer;	/* next character to account for */
 
     while (*scan != '\0') {
 	char ch = *scan++;
 
 	if (ch == '\n' || ch == 014)	/* newline or form feed */
 	    col = 1;
 	else if (ch == '\t')		/* advance to the next tab stop */
 	    col = ((col - 1) & tabmask) + tabsize + 1;
 	else if (ch == 010)		/* backspace */
 	    --col;
 	else
 	    ++col;
     }
     return (col);
 }
 
 /*
  * Report a diagnostic whose format "msg" takes two int arguments.  A
  * non-zero level is an error and sets found_err; level 0 is a warning.
  * When the formatted source goes to stdout the message is written there,
  * wrapped in an INDENT comment; otherwise it goes to stderr.
  */
 void
 diag4(int level, const char *msg, int a, int b)
 {
     const char *severity = (level == 0) ? "Warning" : "Error";
 
     if (level != 0)
 	found_err = 1;
     if (output != stdout) {
 	fprintf(stderr, "%s@%d: ", severity, line_no);
 	fprintf(stderr, msg, a, b);
 	fprintf(stderr, "\n");
 	return;
     }
     fprintf(stdout, "/**INDENT** %s@%d: ", severity, line_no);
     fprintf(stdout, msg, a, b);
     fprintf(stdout, " */\n");
 }
 
 /*
  * Report a diagnostic whose format "msg" takes one int argument.  A
  * non-zero level is an error and sets found_err; level 0 is a warning.
  * When the formatted source goes to stdout the message is written there,
  * wrapped in an INDENT comment; otherwise it goes to stderr.
  */
 void
 diag3(int level, const char *msg, int a)
 {
     const char *severity = (level == 0) ? "Warning" : "Error";
 
     if (level != 0)
 	found_err = 1;
     if (output != stdout) {
 	fprintf(stderr, "%s@%d: ", severity, line_no);
 	fprintf(stderr, msg, a);
 	fprintf(stderr, "\n");
 	return;
     }
     fprintf(stdout, "/**INDENT** %s@%d: ", severity, line_no);
     fprintf(stdout, msg, a);
     fprintf(stdout, " */\n");
 }
 
 /*
  * Report a fixed diagnostic string; "msg" is printed verbatim and is not
  * interpreted as a format.  A non-zero level is an error and sets
  * found_err; level 0 is a warning.  When the formatted source goes to stdout
  * the message is written there, wrapped in an INDENT comment; otherwise it
  * goes to stderr.
  */
 void
 diag2(int level, const char *msg)
 {
     const char *severity = (level == 0) ? "Warning" : "Error";
 
     if (level != 0)
 	found_err = 1;
     if (output != stdout) {
 	fprintf(stderr, "%s@%d: ", severity, line_no);
 	fprintf(stderr, "%s", msg);
 	fprintf(stderr, "\n");
 	return;
     }
     fprintf(stdout, "/**INDENT** %s@%d: ", severity, line_no);
     fprintf(stdout, "%s", msg);
     fprintf(stdout, " */\n");
 }
 
 /*
  * Write two troff requests to the output for font state "f": a string
  * definition "f<nm>" holding the font name and a number register "s<nm>"
  * holding the size, where <nm> is the single character passed in nm.
  */
 void
 writefdef(struct fstate *f, int nm)
 {
     const char *face = f->font;
     int points = f->size;
 
     fprintf(output, ".ds f%c %s\n.nr s%c %d\n", nm, face, nm, points);
 }
 
 /*
  * Append to "s" the escape sequences needed to switch from font state "of"
  * to font state "nf": a font change when either of the first two font-name
  * characters differs, and a relative size change when the sizes differ.
  * Returns the position just past the last character written; nothing is
  * written when the two states match.  No terminating NUL is stored.
  */
 char *
 chfont(struct fstate *of, struct fstate *nf, char *s)
 {
     char *out = s;
 
     if (of->font[0] != nf->font[0] || of->font[1] != nf->font[1]) {
 	*out++ = '\\';
 	*out++ = 'f';
 	if (nf->font[1])	/* two-character names are introduced by '(' */
 	    *out++ = '(';
 	*out++ = nf->font[0];
 	if (nf->font[1])
 	    *out++ = nf->font[1];
     }
     if (nf->size != of->size) {
 	int shrinking = nf->size < of->size;
 
 	*out++ = '\\';
 	*out++ = 's';
 	*out++ = shrinking ? '-' : '+';
 	/* the size difference is written as a single character */
 	if (shrinking)
 	    *out++ = '0' + of->size - nf->size;
 	else
 	    *out++ = '0' + nf->size - of->size;
     }
     return out;
 }
 
 /*
  * Parse the font specification string "s0" into "*f", which is zeroed first.
  *
  *   digits             accumulate into f->size
  *   upper-case letter  first one sets f->font[0], later ones f->font[1]
  *   'c'                sets f->allcaps
  *   '+' / '-'          increment / decrement a size delta
  *
  * Any other character is fatal (errx).  Afterwards the font defaults to "R"
  * and bodyf.size to 11 when unset, and the size is resolved against
  * bodyf.size:
  *   no digits given         size = bodyf.size + delta
  *   digits, delta > 0       size = bodyf.size + digits
  *   digits, delta <= 0      size = bodyf.size - digits
  * NOTE(review): the last case also applies when no sign was given at all;
  * confirm that this is intended.
  */
 void
 parsefont(struct fstate *f, const char *s0)
 {
     const char *s = s0;
     int         sizedelta = 0;
 
     bzero(f, sizeof *f);
     while (*s) {
 	/* cast: the <ctype.h> functions are undefined for negative values */
 	if (isdigit((unsigned char)*s))
 	    f->size = f->size * 10 + *s - '0';
 	else if (isupper((unsigned char)*s)) {
 	    if (f->font[0])
 		f->font[1] = *s;
 	    else
 		f->font[0] = *s;
 	}
 	else if (*s == 'c')
 	    f->allcaps = 1;
 	else if (*s == '+')
 	    sizedelta++;
 	else if (*s == '-')
 	    sizedelta--;
 	else {
 	    errx(1, "bad font specification: %s", s0);
 	}
 	s++;
     }
     if (f->font[0] == 0)
 	f->font[0] = 'R';
     if (bodyf.size == 0)
 	bodyf.size = 11;
     if (f->size == 0)
 	f->size = bodyf.size + sizedelta;
     else if (sizedelta > 0)
 	f->size += bodyf.size;
     else
 	f->size = bodyf.size - f->size;
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/indent/lexi.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/indent/lexi.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/indent/lexi.c	(revision 303517)
@@ -1,606 +1,606 @@
 /*
  * Copyright (c) 1985 Sun Microsystems, Inc.
  * Copyright (c) 1980, 1993
  *	The Regents of the University of California.  All rights reserved.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Here we have the token scanner for indent.  It scans off one token and puts
  * it in the global variable "token".  It returns a code, indicating the type
  * of token scanned.
  */
 
 #include <err.h>
 #include <stdio.h>
 #include <ctype.h>
 #include <stdlib.h>
 #include <string.h>
 #include "indent_globs.h"
 #include "indent_codes.h"
 #include "indent.h"
 
 /* character classes stored in chartype[]; every other entry there is 0 */
 #define alphanum 1		/* may start or continue an identifier or number */
 #define opchar 3		/* operator character */
 
 /*
  * One entry of the keyword table searched by lexi().
  */
 struct templ {
     const char *rwd;		/* the keyword; a null pointer ends the table */
     int         rwcode;		/* keyword class, see below */
 };
 
 /*
  * Keyword table.  lexi() maps rwcode to a token code as follows:
  *   1  switch                     -> swstmt
  *   2  case, default              -> casestmt
  *   3  struct, union, enum        -> handled like 4, and remembered so the
  *                                    next identifier is taken as a declaration
  *   4  declaration keywords       -> decl (outside parentheses)
  *   5  if, while, for             -> sp_paren
  *   6  else, do                   -> sp_nparen
  *   7  sizeof                     -> ident, with ps.sizeof_keyword set
  *   0  anything else              -> ident
  * addkey() appends further entries at run time, which is why the array is
  * larger than its initializer.
  */
 struct templ specials[1000] =
 {
     {"switch", 1},
     {"case", 2},
     {"break", 0},
     {"struct", 3},
     {"union", 3},
     {"enum", 3},
     {"default", 2},
     {"int", 4},
     {"char", 4},
     {"float", 4},
     {"double", 4},
     {"long", 4},
     {"short", 4},
     {"typedef", 4},
     {"unsigned", 4},
     {"register", 4},
     {"static", 4},
     {"global", 4},
     {"extern", 4},
     {"void", 4},
     {"const", 4},
     {"volatile", 4},
     {"goto", 0},
     {"return", 0},
     {"if", 5},
     {"while", 5},
     {"for", 5},
     {"else", 6},
     {"do", 6},
     {"sizeof", 7},
     {0, 0}			/* end-of-table marker */
 };
 
 char        chartype[128] =
 {				/* this is used to facilitate the decision of
 				 * what type (alphanumeric, operator) each
 				 * character is.  Indexed by 7-bit character
 				 * code, eight codes per row: 1 = alphanum,
 				 * 3 = opchar, 0 = neither */
     0, 0, 0, 0, 0, 0, 0, 0,	/* 000 - 007: control characters */
     0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,
     0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
     0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
     1, 1, 1, 1, 1, 1, 1, 1,	/* 0 - 7 */
     1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
     0, 1, 1, 1, 1, 1, 1, 1,	/* @ A - G */
     1, 1, 1, 1, 1, 1, 1, 1,	/* H - O */
     1, 1, 1, 1, 1, 1, 1, 1,	/* P - W */
     1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
     0, 1, 1, 1, 1, 1, 1, 1,	/* ` a - g */
     1, 1, 1, 1, 1, 1, 1, 1,	/* h - o */
     1, 1, 1, 1, 1, 1, 1, 1,	/* p - w */
     1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
 };
 
 /*
  * Look up the class of character c in chartype[].  Values outside the
  * table (negative, or above 127) are reported as class 0, so bytes with the
  * high bit set never index outside the array.
  */
 static int
 lexi_char_class(int c)
 {
     if (c < 0 || c >= (int) sizeof(chartype))
 	return 0;
     return chartype[c];
 }
 
 /*
  * Scan one token from the input buffer into the token buffer
  * (s_token .. e_token) and return its token code.  Also updates ps.col_1,
  * ps.last_nl, ps.last_u_d, ps.its_a_keyword and ps.sizeof_keyword, and
  * refills the input buffer through fill_buffer() whenever it is exhausted.
  */
 int
 lexi(void)
 {
     int         unary_delim;	/* this is set to 1 if the current token
 				 * forces a following operator to be unary */
     static int  last_code;	/* the last token type returned */
     static int  l_struct;	/* set to 1 if the last token was 'struct' */
     int         code;		/* internal code to be returned */
     char        qchar;		/* the delimiter character for a string */
 
     e_token = s_token;		/* point to start of place to save token */
     unary_delim = false;
     ps.col_1 = ps.last_nl;	/* tell world that this token started in
 				 * column 1 iff the last thing scanned was nl */
     ps.last_nl = false;
 
     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
 	ps.col_1 = false;	/* leading blanks imply token is not in column
 				 * 1 */
 	if (++buf_ptr >= buf_end)
 	    fill_buffer();
     }
 
     /* Scan an alphanumeric token */
     if (lexi_char_class(*buf_ptr) == alphanum ||
 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
 	/*
 	 * we have a character or number
 	 */
 	const char *j;		/* used for searching thru list of
 				 *
 				 * reserved words */
 	struct templ *p;
 
 	if (isdigit((unsigned char)*buf_ptr) ||
 		(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
 	    int         seendot = 0,
 	                seenexp = 0,
 			seensfx = 0;
 	    if (*buf_ptr == '0' &&
 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 		/* hexadecimal constant: copy the prefix, then the digits */
 		*e_token++ = *buf_ptr++;
 		*e_token++ = *buf_ptr++;
 		while (isxdigit((unsigned char)*buf_ptr)) {
 		    CHECK_SIZE_TOKEN;
 		    *e_token++ = *buf_ptr++;
 		}
 	    }
 	    else
 		/* decimal or floating constant: digits, at most one '.',
 		 * at most one exponent with an optional sign */
 		while (1) {
 		    if (*buf_ptr == '.') {
 			if (seendot)
 			    break;
 			else
 			    seendot++;
 		    }
 		    CHECK_SIZE_TOKEN;
 		    *e_token++ = *buf_ptr++;
 		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 			    break;
 			else {
 			    seenexp++;
 			    seendot++;
 			    CHECK_SIZE_TOKEN;
 			    *e_token++ = *buf_ptr++;
 			    if (*buf_ptr == '+' || *buf_ptr == '-')
 				*e_token++ = *buf_ptr++;
 			}
 		    }
 		}
 	    /* suffixes: one of U/u (bit 1 of seensfx) and one of f/F/l/L,
 	     * possibly doubled (bit 2), in either order */
 	    while (1) {
 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 		    CHECK_SIZE_TOKEN;
 		    *e_token++ = *buf_ptr++;
 		    seensfx |= 1;
 		    continue;
 		}
 		if (!(seensfx & 2) && strchr("fFlL", *buf_ptr)) {
 		    CHECK_SIZE_TOKEN;
 		    if (buf_ptr[1] == buf_ptr[0])
 		        *e_token++ = *buf_ptr++;
 		    *e_token++ = *buf_ptr++;
 		    seensfx |= 2;
 		    continue;
 		}
 		break;
 	    }
 	}
 	else
 	    while (lexi_char_class(*buf_ptr) == alphanum || *buf_ptr == BACKSLASH) {
 		/* fill_buffer() terminates buffer with newline */
 		if (*buf_ptr == BACKSLASH) {
 		    if (*(buf_ptr + 1) == '\n') {
 			/* backslash-newline inside an identifier: skip it */
 			buf_ptr += 2;
 			if (buf_ptr >= buf_end)
 			    fill_buffer();
 			} else
 			    break;
 		}
 		CHECK_SIZE_TOKEN;
 		/* copy it over */
 		*e_token++ = *buf_ptr++;
 		if (buf_ptr >= buf_end)
 		    fill_buffer();
 	    }
 	*e_token++ = '\0';
 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 	}
 	ps.its_a_keyword = false;
 	ps.sizeof_keyword = false;
 	if (l_struct && !ps.p_l_follow) {
 				/* if last token was 'struct' and we're not
 				 * in parentheses, then this token
 				 * should be treated as a declaration */
 	    l_struct = false;
 	    last_code = ident;
 	    ps.last_u_d = true;
 	    return (decl);
 	}
 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
 				 * unless last token was 'struct' */
 	l_struct = false;
 	last_code = ident;	/* Remember that this is the code we will
 				 * return */
 
 	if (auto_typedefs) {
 	    const char *q = s_token;
 	    size_t q_len = strlen(q);
 	    /* Check if we have an "_t" in the end */
 	    if (q_len > 2 &&
 	        (strcmp(q + q_len - 2, "_t") == 0)) {
 	        ps.its_a_keyword = true;
 		ps.last_u_d = true;
 	        goto found_auto_typedef;
 	    }
 	}
 
 	/*
 	 * This loop will check if the token is a keyword.
 	 */
-	for (p = specials; (j = p->rwd) != 0; p++) {
+	for (p = specials; (j = p->rwd) != NULL; p++) {
 	    const char *q = s_token;	/* point at scanned token */
 	    if (*j++ != *q++ || *j++ != *q++)
 		continue;	/* This test depends on the fact that
 				 * identifiers are always at least 1 character
 				 * long (ie. the first two bytes of the
 				 * identifier are always meaningful) */
 	    if (q[-1] == 0)
 		break;		/* If its a one-character identifier */
 	    while (*q++ == *j)
 		if (*j++ == 0)
 		    goto found_keyword;	/* I wish that C had a multi-level
 					 * break... */
 	}
 	if (p->rwd) {		/* we have a keyword */
     found_keyword:
 	    ps.its_a_keyword = true;
 	    ps.last_u_d = true;
 	    switch (p->rwcode) {
 	    case 1:		/* it is a switch */
 		return (swstmt);
 	    case 2:		/* a case or default */
 		return (casestmt);
 
 	    case 3:		/* a "struct" */
 		/*
 		 * Next time around, we will want to know that we have had a
 		 * 'struct'
 		 */
 		l_struct = true;
 		/* FALLTHROUGH */
 
 	    case 4:		/* one of the declaration keywords */
 	    found_auto_typedef:
 		if (ps.p_l_follow) {
 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 		    break;	/* inside parens: cast, param list or sizeof */
 		}
 		last_code = decl;
 		return (decl);
 
 	    case 5:		/* if, while, for */
 		return (sp_paren);
 
 	    case 6:		/* do, else */
 		return (sp_nparen);
 
 	    case 7:
 		ps.sizeof_keyword = true;
 		/* FALLTHROUGH */
 	    default:		/* all others are treated like any other
 				 * identifier */
 		return (ident);
 	    }			/* end of switch */
 	}			/* end of if (found_it) */
 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 	    /*
 	     * identifier followed by '(' at the outermost level: treat it
 	     * as a procedure name unless a ')' followed by ';' or ','
 	     * appears later in the buffer
 	     */
 	    char *tp = buf_ptr;
 	    while (tp < buf_end)
 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 		    goto not_proc;
 	    /* NOTE(review): relies on the last byte of ps.procname already
 	     * being NUL for termination -- confirm */
 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
 	    ps.in_parameter_declaration = 1;
 	    rparen_count = 1;
     not_proc:;
 	}
 	/*
 	 * The following hack attempts to guess whether or not the current
 	 * token is in fact a declaration keyword -- one that has been
 	 * typedefd
 	 */
 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
 		isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
 		&& !ps.p_l_follow
 	        && !ps.block_init
 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
 		    ps.last_token == decl ||
 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
 	    ps.its_a_keyword = true;
 	    ps.last_u_d = true;
 	    last_code = decl;
 	    return decl;
 	}
 	if (last_code == decl)	/* if this is a declared variable, then
 				 * following sign is unary */
 	    ps.last_u_d = true;	/* will make "int a -1" work */
 	last_code = ident;
 	return (ident);		/* the ident is not in the list */
     }				/* end of procesing for alpanum character */
 
     /* Scan a non-alphanumeric token */
 
     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
 				 * moved here */
     *e_token = '\0';
     if (++buf_ptr >= buf_end)
 	fill_buffer();
 
     switch (*token) {
     case '\n':
 	unary_delim = ps.last_u_d;
 	ps.last_nl = true;	/* remember that we just had a newline */
 	code = (had_eof ? 0 : newline);
 
 	/*
 	 * if data has been exhausted, the newline is a dummy, and we should
 	 * return code to stop
 	 */
 	break;
 
     case '\'':			/* start of quoted character */
     case '"':			/* start of string */
 	qchar = *token;
 	if (troff) {
 	    e_token[-1] = '`';
 	    if (qchar == '"')
 		*e_token++ = '`';
 	    e_token = chfont(&bodyf, &stringf, e_token);
 	}
 	do {			/* copy the string */
 	    while (1) {		/* move one character or [/<char>]<char> */
 		if (*buf_ptr == '\n') {
 		    diag2(1, "Unterminated literal");
 		    goto stop_lit;
 		}
 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
 					 * since CHECK_SIZE guarantees that there
 					 * are at least 5 entries left */
 		*e_token = *buf_ptr++;
 		if (buf_ptr >= buf_end)
 		    fill_buffer();
 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
 		    if (*buf_ptr == '\n')	/* check for escaped newline */
 			++line_no;
 		    if (troff) {
 			*++e_token = BACKSLASH;
 			if (*buf_ptr == BACKSLASH)
 			    *++e_token = BACKSLASH;
 		    }
 		    *++e_token = *buf_ptr++;
 		    ++e_token;	/* we must increment this again because we
 				 * copied two chars */
 		    if (buf_ptr >= buf_end)
 			fill_buffer();
 		}
 		else
 		    break;	/* we copied one character */
 	    }			/* end of while (1) */
 	} while (*e_token++ != qchar);
 	if (troff) {
 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
 	    if (qchar == '"')
 		*e_token++ = '\'';
 	}
 stop_lit:
 	code = ident;
 	break;
 
     case ('('):
     case ('['):
 	unary_delim = true;
 	code = lparen;
 	break;
 
     case (')'):
     case (']'):
 	code = rparen;
 	break;
 
     case '#':
 	unary_delim = ps.last_u_d;
 	code = preesc;
 	break;
 
     case '?':
 	unary_delim = true;
 	code = question;
 	break;
 
     case (':'):
 	code = colon;
 	unary_delim = true;
 	break;
 
     case (';'):
 	unary_delim = true;
 	code = semicolon;
 	break;
 
     case ('{'):
 	unary_delim = true;
 
 	/*
 	 * if (ps.in_or_st) ps.block_init = 1;
 	 */
 	/* ?	code = ps.block_init ? lparen : lbrace; */
 	code = lbrace;
 	break;
 
     case ('}'):
 	unary_delim = true;
 	/* ?	code = ps.block_init ? rparen : rbrace; */
 	code = rbrace;
 	break;
 
     case 014:			/* a form feed */
 	unary_delim = ps.last_u_d;
 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
 				 * right */
 	code = form_feed;
 	break;
 
     case (','):
 	unary_delim = true;
 	code = comma;
 	break;
 
     case '.':
 	unary_delim = false;
 	code = period;
 	break;
 
     case '-':
     case '+':			/* check for -, +, --, ++ */
 	code = (ps.last_u_d ? unary_op : binary_op);
 	unary_delim = true;
 
 	if (*buf_ptr == token[0]) {
 	    /* check for doubled character */
 	    *e_token++ = *buf_ptr++;
 	    /* buffer overflow will be checked at end of loop */
 	    if (last_code == ident || last_code == rparen) {
 		code = (ps.last_u_d ? unary_op : postop);
 		/* check for following ++ or -- */
 		unary_delim = false;
 	    }
 	}
 	else if (*buf_ptr == '=')
 	    /* check for operator += */
 	    *e_token++ = *buf_ptr++;
 	else if (*buf_ptr == '>') {
 	    /* check for operator -> */
 	    *e_token++ = *buf_ptr++;
 	    if (!pointer_as_binop) {
 		unary_delim = false;
 		code = unary_op;
 		ps.want_blank = false;
 	    }
 	}
 	break;			/* buffer overflow will be checked at end of
 				 * switch */
 
     case '=':
 	if (ps.in_or_st)
 	    ps.block_init = 1;
 #ifdef undef
 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
 	    e_token[-1] = *buf_ptr++;
 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 		*e_token++ = *buf_ptr++;
 	    *e_token++ = '=';	/* Flip =+ to += */
 	    *e_token = 0;
 	}
 #else
 	if (*buf_ptr == '=') {/* == */
 	    *e_token++ = '=';	/* Flip =+ to += */
 	    buf_ptr++;
 	    *e_token = 0;
 	}
 #endif
 	code = binary_op;
 	unary_delim = true;
 	break;
 	/* can drop thru!!! */
 
     case '>':
     case '<':
     case '!':			/* ops like <, <<, <=, !=, etc */
 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 	    *e_token++ = *buf_ptr;
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 	}
 	if (*buf_ptr == '=')
 	    *e_token++ = *buf_ptr++;
 	code = (ps.last_u_d ? unary_op : binary_op);
 	unary_delim = true;
 	break;
 
     default:
 	if (token[0] == '/' && *buf_ptr == '*') {
 	    /* it is start of comment */
 	    *e_token++ = '*';
 
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 
 	    code = comment;
 	    unary_delim = ps.last_u_d;
 	    break;
 	}
 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 	    /*
 	     * handle ||, &&, etc, and also things as in int *****i
 	     */
 	    *e_token++ = *buf_ptr;
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 	}
 	code = (ps.last_u_d ? unary_op : binary_op);
 	unary_delim = true;
 
 
     }				/* end of switch */
     if (code != newline) {
 	l_struct = false;
 	last_code = code;
     }
     if (buf_ptr >= buf_end)	/* check for input buffer empty */
 	fill_buffer();
     ps.last_u_d = unary_delim;
     *e_token = '\0';		/* null terminate the token */
     return (code);
 }
 
 /*
  * Add the given keyword to the keyword table, using val as the keyword type
  */
 void
 addkey(char *key, int val)
 {
     struct templ *p = specials;
 
     /* find the end-of-table marker; a keyword already present is left alone */
     while (p->rwd)
 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 	    return;
 	else
 	    p++;
     /*
      * Two slots are written below: the new entry at p and the new
      * end-of-table marker at p[1].  Both must lie inside the array, so the
      * test is on p + 1, not on p.
      */
     if (p + 1 >= specials + sizeof specials / sizeof specials[0])
 	return;			/* For now, table overflows are silently
 				 * ignored */
     p->rwd = key;		/* the pointer is stored, not a copy */
     p->rwcode = val;
-    p[1].rwd = 0;
+    p[1].rwd = NULL;
     p[1].rwcode = 0;
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/indent/pr_comment.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/indent/pr_comment.c	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/indent/pr_comment.c	(revision 303517)
@@ -1,429 +1,429 @@
 /*
  * Copyright (c) 1985 Sun Microsystems, Inc.
  * Copyright (c) 1980, 1993
  *	The Regents of the University of California.  All rights reserved.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)pr_comment.c	8.1 (Berkeley) 6/6/93";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <err.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "indent_globs.h"
 #include "indent.h"
 /*
  * NAME:
  *	pr_comment
  *
  * FUNCTION:
  *	This routine takes care of scanning and printing comments.
  *
  * ALGORITHM:
  *	1) Decide where the comment should be aligned, and if lines should
  *	   be broken.
  *	2) If lines should not be broken and filled, just copy up to end of
  *	   comment.
  *	3) If lines should be filled, then scan thru input_buffer copying
  *	   characters to com_buf.  Remember where the last blank, tab, or
  *	   newline was.  When line is filled, print up to last blank and
  *	   continue copying.
  *
  * HISTORY:
  *	November 1976	D A Willcox of CAC	Initial coding
  *	12/6/76		D A Willcox of CAC	Modification to handle
  *						UNIX-style comments
  *
  */
 
 /*
  * this routine processes comments.  It makes an attempt to keep comments from
  * going over the max line length.  If a line is too long, it moves everything
  * from the last blank to the next comment line.  Blanks and tabs from the
  * beginning of the input line are removed
  */
 
 void
 pr_comment(void)
 {
     int         now_col;	/* column we are in now */
     int         adj_max_col;	/* Adjusted max_col for when we decide to
 				 * spill comments over the right margin */
     char       *last_bl;	/* points to the last blank in the output
 				 * buffer */
     char       *t_ptr;		/* used for moving string */
     int         unix_comment;	/* tri-state variable used to decide if it is
 				 * a unix-style comment. 0 means only blanks
 				 * since /+*, 1 means regular style comment, 2
 				 * means unix style comment */
     int         break_delim = comment_delimiter_on_blankline;
     int         l_just_saw_decl = ps.just_saw_decl;
     /*
      * int         ps.last_nl = 0;	 true iff the last significant thing
-     * weve seen is a newline
+     * we've seen is a newline
      */
     int         one_liner = 1;	/* true iff this comment is a one-liner */
     adj_max_col = max_col;
     ps.just_saw_decl = 0;
-    last_bl = 0;		/* no blanks found so far */
+    last_bl = NULL;		/* no blanks found so far */
     ps.box_com = false;		/* at first, assume that we are not in
 					 * a boxed comment or some other
 					 * comment that should not be touched */
     ++ps.out_coms;		/* keep track of number of comments */
     unix_comment = 1;		/* set flag to let us figure out if there is a
 				 * unix-style comment ** DISABLED: use 0 to
 				 * reenable this hack! */
 
     /* Figure where to align and how to treat the comment */
 
     if (ps.col_1 && !format_col1_comments) {	/* if comment starts in column
 						 * 1 it should not be touched */
 	ps.box_com = true;
 	ps.com_col = 1;
     }
     else {
 	if (*buf_ptr == '-' || *buf_ptr == '*' ||
 	    (*buf_ptr == '\n' && !format_block_comments)) {
 	    ps.box_com = true;	/* A comment with a '-' or '*' immediately
 				 * after the /+* is assumed to be a boxed
 				 * comment. A comment with a newline
 				 * immediately after the /+* is assumed to
 				 * be a block comment and is treated as a
 				 * box comment unless format_block_comments
 				 * is nonzero (the default). */
 	    break_delim = 0;
 	}
 	if ( /* ps.bl_line && */ (s_lab == e_lab) && (s_code == e_code)) {
 	    /* klg: check only if this line is blank */
 	    /*
-	     * If this (*and previous lines are*) blank, dont put comment way
+	     * If this (*and previous lines are*) blank, don't put comment way
 	     * out at left
 	     */
 	    ps.com_col = (ps.ind_level - ps.unindent_displace) * ps.ind_size + 1;
 	    adj_max_col = block_comment_max_col;
 	    if (ps.com_col <= 1)
 		ps.com_col = 1 + !format_col1_comments;
 	}
 	else {
 	    int target_col;
 	    break_delim = 0;
 	    if (s_code != e_code)
 		target_col = count_spaces(compute_code_target(), s_code);
 	    else {
 		target_col = 1;
 		if (s_lab != e_lab)
 		    target_col = count_spaces(compute_label_target(), s_lab);
 	    }
 	    ps.com_col = ps.decl_on_line || ps.ind_level == 0 ? ps.decl_com_ind : ps.com_ind;
 	    if (ps.com_col < target_col)
 		ps.com_col = ((target_col + 7) & ~7) + 1;
 	    if (ps.com_col + 24 > adj_max_col)
 		adj_max_col = ps.com_col + 24;
 	}
     }
     if (ps.box_com) {
 	buf_ptr[-2] = 0;
 	ps.n_comment_delta = 1 - count_spaces(1, in_buffer);
 	buf_ptr[-2] = '/';
     }
     else {
 	ps.n_comment_delta = 0;
 	while (*buf_ptr == ' ' || *buf_ptr == '\t')
 	    buf_ptr++;
     }
     ps.comment_delta = 0;
     *e_com++ = '/';		/* put '/' followed by '*' into buffer */
     *e_com++ = '*';
     if (*buf_ptr != ' ' && !ps.box_com)
 	*e_com++ = ' ';
 
     *e_com = '\0';
     if (troff) {
 	now_col = 1;
 	adj_max_col = 80;
     }
     else
 	now_col = count_spaces(ps.com_col, s_com);	/* figure what column we
 							 * would be in if we
 							 * printed the comment
 							 * now */
 
     /* Start to copy the comment */
 
     while (1) {			/* this loop will go until the comment is
 				 * copied */
 	if (*buf_ptr > 040 && *buf_ptr != '*')
 	    ps.last_nl = 0;
 	CHECK_SIZE_COM;
 	switch (*buf_ptr) {	/* this checks for various spcl cases */
 	case 014:		/* check for a form feed */
 	    if (!ps.box_com) {	/* in a text comment, break the line here */
 		ps.use_ff = true;
 		/* fix so dump_line uses a form feed */
 		dump_line();
-		last_bl = 0;
+		last_bl = NULL;
 		*e_com++ = ' ';
 		*e_com++ = '*';
 		*e_com++ = ' ';
 		while (*++buf_ptr == ' ' || *buf_ptr == '\t');
 	    }
 	    else {
 		if (++buf_ptr >= buf_end)
 		    fill_buffer();
 		*e_com++ = 014;
 	    }
 	    break;
 
 	case '\n':
 	    if (had_eof) {	/* check for unexpected eof */
 		printf("Unterminated comment\n");
 		*e_com = '\0';
 		dump_line();
 		return;
 	    }
 	    one_liner = 0;
 	    if (ps.box_com || ps.last_nl) {	/* if this is a boxed comment,
-						 * we dont ignore the newline */
+						 * we don't ignore the newline */
 		if (s_com == e_com) {
 		    *e_com++ = ' ';
 		    *e_com++ = ' ';
 		}
 		*e_com = '\0';
 		if (!ps.box_com && e_com - s_com > 3) {
 		    if (break_delim == 1 && s_com[0] == '/'
 			    && s_com[1] == '*' && s_com[2] == ' ') {
 			char       *t = e_com;
 			break_delim = 2;
 			e_com = s_com + 2;
 			*e_com = 0;
 			if (blanklines_before_blockcomments)
 			    prefix_blankline_requested = 1;
 			dump_line();
 			e_com = t;
 			s_com[0] = s_com[1] = s_com[2] = ' ';
 		    }
 		    dump_line();
 		    CHECK_SIZE_COM;
 		    *e_com++ = ' ';
 		    *e_com++ = ' ';
 		}
 		dump_line();
 		now_col = ps.com_col;
 	    }
 	    else {
 		ps.last_nl = 1;
 		if (unix_comment != 1) {	/* we not are in unix_style
 						 * comment */
 		    if (unix_comment == 0 && s_code == e_code) {
 			/*
 			 * if it is a UNIX-style comment, ignore the
 			 * requirement that previous line be blank for
 			 * unindention
 			 */
 			ps.com_col = (ps.ind_level - ps.unindent_displace) * ps.ind_size + 1;
 			if (ps.com_col <= 1)
 			    ps.com_col = 2;
 		    }
 		    unix_comment = 2;	/* permanently remember that we are in
 					 * this type of comment */
 		    dump_line();
 		    ++line_no;
 		    now_col = ps.com_col;
 		    *e_com++ = ' ';
 		    /*
 		     * fix so that the star at the start of the line will line
 		     * up
 		     */
 		    do		/* flush leading white space */
 			if (++buf_ptr >= buf_end)
 			    fill_buffer();
 		    while (*buf_ptr == ' ' || *buf_ptr == '\t');
 		    break;
 		}
 		if (*(e_com - 1) == ' ' || *(e_com - 1) == '\t')
 		    last_bl = e_com - 1;
 		/*
 		 * if there was a space at the end of the last line, remember
 		 * where it was
 		 */
 		else {		/* otherwise, insert one */
 		    last_bl = e_com;
 		    CHECK_SIZE_COM;
 		    *e_com++ = ' ';
 		    ++now_col;
 		}
 	    }
 	    ++line_no;		/* keep track of input line number */
 	    if (!ps.box_com) {
 		int         nstar = 1;
 		do {		/* flush any blanks and/or tabs at start of
 				 * next line */
 		    if (++buf_ptr >= buf_end)
 			fill_buffer();
 		    if (*buf_ptr == '*' && --nstar >= 0) {
 			if (++buf_ptr >= buf_end)
 			    fill_buffer();
 			if (*buf_ptr == '/')
 			    goto end_of_comment;
 		    }
 		} while (*buf_ptr == ' ' || *buf_ptr == '\t');
 	    }
 	    else if (++buf_ptr >= buf_end)
 		fill_buffer();
 	    break;		/* end of case for newline */
 
 	case '*':		/* must check for possibility of being at end
 				 * of comment */
 	    if (++buf_ptr >= buf_end)	/* get to next char after * */
 		fill_buffer();
 
 	    if (unix_comment == 0)	/* set flag to show we are not in
 					 * unix-style comment */
 		unix_comment = 1;
 
 	    if (*buf_ptr == '/') {	/* it is the end!!! */
 	end_of_comment:
 		if (++buf_ptr >= buf_end)
 		    fill_buffer();
 
 		if (*(e_com - 1) != ' ' && !ps.box_com) {	/* insure blank before
 								 * end */
 		    *e_com++ = ' ';
 		    ++now_col;
 		}
 		if (break_delim == 1 && !one_liner && s_com[0] == '/'
 			&& s_com[1] == '*' && s_com[2] == ' ') {
 		    char       *t = e_com;
 		    break_delim = 2;
 		    e_com = s_com + 2;
 		    *e_com = 0;
 		    if (blanklines_before_blockcomments)
 			prefix_blankline_requested = 1;
 		    dump_line();
 		    e_com = t;
 		    s_com[0] = s_com[1] = s_com[2] = ' ';
 		}
 		if (break_delim == 2 && e_com > s_com + 3
 			 /* now_col > adj_max_col - 2 && !ps.box_com */ ) {
 		    *e_com = '\0';
 		    dump_line();
 		    now_col = ps.com_col;
 		}
 		CHECK_SIZE_COM;
 		*e_com++ = '*';
 		*e_com++ = '/';
 		*e_com = '\0';
 		ps.just_saw_decl = l_just_saw_decl;
 		return;
 	    }
 	    else {		/* handle isolated '*' */
 		*e_com++ = '*';
 		++now_col;
 	    }
 	    break;
 	default:		/* we have a random char */
 	    if (unix_comment == 0 && *buf_ptr != ' ' && *buf_ptr != '\t')
 		unix_comment = 1;	/* we are not in unix-style comment */
 
 	    *e_com = *buf_ptr++;
 	    if (buf_ptr >= buf_end)
 		fill_buffer();
 
 	    if (*e_com == '\t')	/* keep track of column */
 		now_col = ((now_col - 1) & tabmask) + tabsize + 1;
 	    else if (*e_com == '\b')	/* this is a backspace */
 		--now_col;
 	    else
 		++now_col;
 
 	    if (*e_com == ' ' || *e_com == '\t')
 		last_bl = e_com;
 	    /* remember we saw a blank */
 
 	    ++e_com;
 	    if (now_col > adj_max_col && !ps.box_com && unix_comment == 1 && e_com[-1] > ' ') {
 		/*
 		 * the comment is too long, it must be broken up
 		 */
 		if (break_delim == 1 && s_com[0] == '/'
 			&& s_com[1] == '*' && s_com[2] == ' ') {
 		    char       *t = e_com;
 		    break_delim = 2;
 		    e_com = s_com + 2;
 		    *e_com = 0;
 		    if (blanklines_before_blockcomments)
 			prefix_blankline_requested = 1;
 		    dump_line();
 		    e_com = t;
 		    s_com[0] = s_com[1] = s_com[2] = ' ';
 		}
-		if (last_bl == 0) {	/* we have seen no blanks */
+		if (last_bl == NULL) {	/* we have seen no blanks */
 		    last_bl = e_com;	/* fake it */
 		    *e_com++ = ' ';
 		}
 		*e_com = '\0';	/* print what we have */
 		*last_bl = '\0';
 		while (last_bl > s_com && last_bl[-1] < 040)
 		    *--last_bl = 0;
 		e_com = last_bl;
 		dump_line();
 
 		*e_com++ = ' ';	/* add blanks for continuation */
 		*e_com++ = ' ';
 		*e_com++ = ' ';
 
 		t_ptr = last_bl + 1;
-		last_bl = 0;
+		last_bl = NULL;
 		if (t_ptr >= e_com) {
 		    while (*t_ptr == ' ' || *t_ptr == '\t')
 			t_ptr++;
 		    while (*t_ptr != '\0') {	/* move unprinted part of
 						 * comment down in buffer */
 			if (*t_ptr == ' ' || *t_ptr == '\t')
 			    last_bl = e_com;
 			*e_com++ = *t_ptr++;
 		    }
 		}
 		*e_com = '\0';
 		now_col = count_spaces(ps.com_col, s_com);	/* recompute current
 								 * position */
 	    }
 	    break;
 	}
     }
 }
Index: user/alc/PQ_LAUNDRY/usr.bin/resizewin/resizewin.1
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/resizewin/resizewin.1	(revision 303516)
+++ user/alc/PQ_LAUNDRY/usr.bin/resizewin/resizewin.1	(revision 303517)
@@ -1,64 +1,66 @@
 .\" resizewin
 .\"
 .\" Query terminal for size and inform the kernel
 .\"
 .\" Copyright 2015 EMC / Isilon Storage Division
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 17, 2016
+.Dd July 9, 2016
 .Dt RESIZEWIN 1
 .Os
 .Sh NAME
 .Nm resizewin
 .Nd update the kernel window size for the current TTY
 .Sh DESCRIPTION
 Query the terminal emulator window size with the
 .Dv TIOCSWINSZ
 ioctl and set the window size known by the kernel to the new values.
 The terminal is assumed to be VT100/ANSI compatible.
 .Nm
 is functionally similar to
 .Xr resize 1 ,
 which is part of the
 .Xr xterm 1
 distribution.
 However,
 .Nm
 only works with VT100/ANSI-compatible terminals and does
 not emit commands to set environment variables.
 .Pp
 After a terminal window has been resized, running
 .Nm
 updates the kernel's window size to match the new size.
 .Pp
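-Note that virtually all modern terninals support VT100/ANSI escape
-sequences, including xterm, konsole, gnome-terminal iTerm,
-Terminal.app, and puTTY.
+Note that virtually all modern terminals support VT100/ANSI escape
+sequences, including xterm, konsole, gnome-terminal, iTerm,
+Terminal.app, and PuTTY.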
 .Sh SEE ALSO
 .Xr resize 1 ,
 .Xr stty 1
 .Sh HISTORY
+The
 .Nm
-appeared in FreeBSD 11.
+command first appeared in
+.Fx 11 .
Index: user/alc/PQ_LAUNDRY
===================================================================
--- user/alc/PQ_LAUNDRY	(revision 303516)
+++ user/alc/PQ_LAUNDRY	(revision 303517)

Property changes on: user/alc/PQ_LAUNDRY
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r303493-303516